In [None]:
# This notebook identifies candidate stabilizing, space-filling mutations by combining cell-entry DMS data with cavity data generated by pyKVFinder.

In [None]:
#import tomllib
import pandas as pd
import tomllib

In [None]:
# Residue numbers treated as H-bond sites by the filters further down
# (ascending order, ten per row).
# NOTE(review): how these sites were determined is not recorded in this notebook.
h_bond_sites = [
    30, 31, 44, 45, 46, 50, 56, 64, 66, 82,
    84, 88, 98, 103, 135, 138, 139, 140, 143, 146,
    148, 152, 156, 163, 164, 166, 167, 177, 178, 181,
    182, 189, 193, 194, 196, 198, 200, 204, 208, 217,
    219, 224, 238, 240, 244, 248, 254, 258, 259, 261,
    265, 270, 273, 275, 279, 281, 286, 287, 294, 295,
    300, 307, 325, 336, 341, 342, 343, 344, 350, 351,
    353, 354, 359, 362, 366, 372, 375, 386, 389, 394,
    395, 396, 413, 415, 432, 451, 455, 466, 468, 471,
]

# Residue numbers treated as salt-bridge sites by the filters further down.
salt_bridge_sites = [44, 45, 82, 136, 139, 240, 287, 295]
# Residue numbers annotated as helix, written as inclusive (first, last) stretches
# and expanded into the flat, ascending list of ints used by the `.isin` lookups.
helix_sites = [
    site
    for first, last in [
        (29, 35),
        (65, 69),
        (74, 99),
        (117, 121),
        (125, 154),
        (175, 187),
        (193, 215),
        (228, 236),
        (238, 245),
        (252, 259),
        (328, 331),
        (349, 363),
        (437, 442),
        (452, 482),
    ]
    for site in range(first, last + 1)
]

# Residue numbers annotated as sheet, written as inclusive (first, last) stretches
# and expanded into the flat, ascending list of ints used by the `.isin` lookups.
sheet_sites = [
    site
    for first, last in [
        (38, 48),
        (53, 60),
        (101, 103),
        (113, 116),
        (122, 124),
        (158, 161),
        (169, 173),
        (226, 227),
        (263, 270),
        (275, 282),
        (286, 298),
        (301, 302),
        (307, 310),
        (315, 319),
        (322, 326),
        (332, 333),
        (337, 340),
        (345, 346),
        (365, 367),
        (376, 379),
        (382, 385),
        (392, 394),
        (410, 412),
        (419, 422),
        (425, 428),
    ]
    for site in range(first, last + 1)
]


In [None]:
# input files: filtered cell-entry effects; the `effect_std` and `n_selections`
# columns are discarded because nothing below reads them
entry_df = pd.read_csv(
    "../../results/filtered_data/cell_entry/Nipah_F_func_effects_filtered.csv"
)
entry_df = entry_df.drop(columns=["effect_std", "n_selections"])



In [None]:
# Load the cavity results (TOML) for the 5EVM structure
with open("../../data/stabilizing_data/5EVM_cavities_output.toml", "rb") as f:
    data = tomllib.load(f)

# Everything used below lives under the RESULTS table
results = data['RESULTS']

# Cavity IDs are the keys of RESULTS.RESIDUES (KAA, KAB, etc.)
cavity_ids = list(results['RESIDUES'].keys())

# One record per (cavity, residue) pair; each residue entry unpacks to
# (site, chain, residue name) and the cavity-level metrics are repeated on every row.
# NOTE(review): `wildtype` is later used as a merge key against `entry_df` --
# confirm both use the same residue-name encoding (one-letter vs three-letter).
rows = [
    {
        'cavity_id': cavity_id,
        'site': site,
        'chain': chain,
        'wildtype': residue_type,
        'area': results['AREA'][cavity_id],
        'volume': results['VOLUME'][cavity_id],
        'avg_depth': results['AVG_DEPTH'][cavity_id],
        'max_depth': results['MAX_DEPTH'][cavity_id],
        'avg_hydropathy': results['AVG_HYDROPATHY'][cavity_id],
    }
    for cavity_id in cavity_ids
    for site, chain, residue_type in results['RESIDUES'][cavity_id]
]

# Build the table in one pass:
#   - cast site to int
#   - keep only residues covered by the DMS data (site <= 482)
#   - order by cavity area, largest first
#   - add the four structural annotation flags
df = (
    pd.DataFrame(rows)
    .astype({'site': int})
    .query('site <= 482')
    .sort_values("area", ascending=False)
    .assign(
        h_bond_sites=lambda frame: frame["site"].isin(h_bond_sites),
        salt_bridge_sites=lambda frame: frame["site"].isin(salt_bridge_sites),
        helix_sites=lambda frame: frame["site"].isin(helix_sites),
        sheet_sites=lambda frame: frame["site"].isin(sheet_sites),
    )
)

# Quick look at the residues lining the smallest cavities (area < 20)
display(df.query('area < 20'))

In [None]:
# Sites with low mean accessibility (mean_accessibility < 10); only these survive
# the final filter at the bottom of this cell.
low_accessibility_sites = (
    pd.read_csv("../../results/residue_accessibility/5evm_accessibility.csv")
    .query("mean_accessibility < 10")["site"]
    .tolist()
)
print(f"Number of low accessibility sites: {len(low_accessibility_sites)}")

# get list of all cavities in structure
cavity_list = df['cavity_id'].unique().astype(str).tolist()

# A later cell rebinds `entry_df` with the same structural-annotation columns that
# `df` already carries. Dropping them here is a no-op on a fresh top-to-bottom run,
# and keeps the merge below from producing `_x`/`_y` duplicates (which would break
# the downstream `h_bond_sites` query) if this cell is re-run afterwards.
annotation_columns = ["h_bond_sites", "salt_bridge_sites", "helix_sites", "sheet_sites"]
entry_effects = entry_df.drop(columns=annotation_columns, errors="ignore")

empty_list = []  # collects one merged frame per cavity
for cavity in cavity_list:
    # residues lining this cavity, one row per site (a site can be listed for several chains)
    subset = df[df['cavity_id'] == cavity].drop_duplicates(subset=['site'])

    # `subset` is already unique on site, so sorting its values is enough
    sites_subset = sorted(subset['site'].tolist())

    # DMS effects for these sites
    effect_subset = entry_effects[entry_effects['site'].isin(sites_subset)]

    # attach cavity metrics and structural flags to every mutation row;
    # rows whose (site, wildtype) pair has no match in `subset` get NaN there
    merged_subsets = pd.merge(effect_subset, subset, on=['site', 'wildtype'], how='left')

    # per-site summary of the mutation effects within this cavity
    high_variation_df = (
        merged_subsets.groupby("site")
        .agg(
            effect_max=("effect", "max"),
            effect_std=("effect", "std"),
            # lowest of the (up to) three highest effects at the site
            effect_min_top3=("effect", lambda x: x.nlargest(3).min()),
        )
        .reset_index()
    )

    # "high variation" = effects spread out (std >= 0.5) and the top three
    # mutations all have an effect of at least -1
    high_variation_sites = high_variation_df.query(
        "effect_std >= 0.5 and effect_min_top3 >= -1"
    )["site"].tolist()

    # Only flag rows here; nothing is removed yet. The actual filtering on these
    # flags happens after all cavities are concatenated, and H-bond sites are
    # filtered in a later cell.
    merged_subsets = merged_subsets.assign(
        is_high_variation_site=merged_subsets['site'].isin(high_variation_sites),
        low_accessibility_site=merged_subsets['site'].isin(low_accessibility_sites),
    )

    empty_list.append(merged_subsets)

final_concat = pd.concat(empty_list, ignore_index=True)
# a site lining several cavities keeps the rows of the first cavity it appears in
final_concat = final_concat.drop_duplicates(['site', 'mutant'])

# `.copy()` makes the filtered result an independent frame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
final_concat = final_concat.query('is_high_variation_site and low_accessibility_site').copy()
display(final_concat)

In [None]:
# Add one boolean column per structural annotation list to the DMS effects table
entry_df = entry_df.assign(
    **{
        flag_name: entry_df["site"].isin(flagged_sites)
        for flag_name, flagged_sites in (
            ("h_bond_sites", h_bond_sites),
            ("salt_bridge_sites", salt_bridge_sites),
            ("helix_sites", helix_sites),
            ("sheet_sites", sheet_sites),
        )
    }
)

# Keep low-accessibility sites that are neither H-bond nor salt-bridge sites
merged_entry_acc = (
    entry_df
    .query("site in @low_accessibility_sites")
    .query('h_bond_sites == False and salt_bridge_sites == False')
)
display(merged_entry_acc)

In [None]:
# function to assign amino acid type
def assign_aa_type(site_num):
    """Map a one-letter amino-acid code to a coarse residue class.

    Parameters
    ----------
    site_num
        Despite the name, this is the amino-acid letter to classify (the
        function is applied to the ``mutant`` and ``wildtype`` columns).
        Matching is exact, so lower-case letters or other symbols are not
        recognised.

    Returns
    -------
    str
        "Negative", "Positive", "Hydrophilic", "Hydrophobic", "Aromatic" or
        "Special"; "Other" when the value is in none of the groups.
    """
    residue_classes = (
        ("Negative", ["D", "E"]),
        ("Positive", ["K", "R", "H"]),
        ("Hydrophilic", ["Q", "N", "S", "T"]),
        ("Hydrophobic", ["A", "I", "L", "M", "V"]),
        ("Aromatic", ["Y", "W", "F"]),
        ("Special", ["C", "G", "P"]),
    )
    # groups are disjoint, so the first hit is the only possible hit
    for class_name, members in residue_classes:
        if site_num in members:
            return class_name
    return "Other"


# Classify mutant and wildtype residues. `.assign` builds a new frame rather than
# writing columns into `final_concat`, which at this point is the filtered result of
# an earlier `.query` (in-place column assignment on such a frame triggers pandas'
# SettingWithCopyWarning).
final_concat = final_concat.assign(
    mutant_type=final_concat["mutant"].apply(assign_aa_type),
    wildtype_type=final_concat["wildtype"].apply(assign_aa_type),
)

# Drop sites involved in H-bonds or salt bridges. Previously only H-bond sites were
# removed here, although the accessibility-only analysis (`merged_entry_acc`)
# excludes both; a salt-bridge site that is not also an H-bond site (136) slipped through.
# NOTE(review): this can change which sites are reported -- confirm it is intended.
final_concat = final_concat.query('h_bond_sites == False and salt_bridge_sites == False')
display(final_concat)

# Summarise effects per site and mutant residue class; `wildtype` holds the
# wildtype residue class (first value per group).
grouped_aa_props = final_concat.groupby(['site', 'mutant_type']).agg(
    mean_effect=('effect', 'mean'),
    max_effect=('effect', 'max'),
    min_effect=('effect', 'min'),
    n_mutants=('mutant', 'nunique'),
    wildtype=('wildtype_type', 'first'),
).reset_index()



In [None]:
# Classify the mutant and wildtype residue of every row
merged_entry_acc = merged_entry_acc.assign(
    mutant_type=merged_entry_acc["mutant"].apply(assign_aa_type),
    wildtype_type=merged_entry_acc["wildtype"].apply(assign_aa_type),
)

# Mean effect and number of distinct mutants per (site, mutant class);
# `wildtype` holds the wildtype residue class (first value per group)
grouped_aa_entry_all = (
    merged_entry_acc.groupby(["site", "mutant_type"])
    .agg(
        mean_effect=pd.NamedAgg(column="effect", aggfunc="mean"),
        n_mutants=pd.NamedAgg(column="mutant", aggfunc="nunique"),
        wildtype=pd.NamedAgg(column="wildtype_type", aggfunc="first"),
    )
    .reset_index()
)
display(grouped_aa_entry_all)

# Wide layout: one row per (site, wildtype class), one column per mutant class
pivoted = grouped_aa_entry_all.pivot_table(
    index=["site", "wildtype"],
    columns="mutant_type",
    values=["mean_effect"],
)
display(pivoted)

# Select sites where hydrophobic OR aromatic substitutions have a lower mean effect
# than hydrophilic ones, and the wildtype residue is hydrophilic or charged.
entry_mean_by_type = pivoted["mean_effect"]
entry_nonpolar_lower = (
    entry_mean_by_type["Hydrophobic"] < entry_mean_by_type["Hydrophilic"]
) | (
    entry_mean_by_type["Aromatic"] < entry_mean_by_type["Hydrophilic"]
)
entry_polar_wildtype = pivoted.index.get_level_values("wildtype").isin(
    ["Hydrophilic", "Positive", "Negative"]
)

sites_meeting_criteria = (
    pivoted[entry_nonpolar_lower & entry_polar_wildtype]
    .index.get_level_values(0)
    .tolist()
)

print(
    f"Hydrophilic sites: {sites_meeting_criteria} \n(n={len(sites_meeting_criteria)})"
)

In [None]:
# Wide layout of the cavity-based summary: one row per (site, wildtype class),
# one column per (statistic, mutant class)
pivoted = grouped_aa_props.pivot_table(
    index=["site", "wildtype"],
    columns="mutant_type",
    values=["mean_effect", "max_effect", "min_effect"],
)
display(pivoted)

cavity_mean_by_type = pivoted["mean_effect"]
cavity_wildtype_class = pivoted.index.get_level_values("wildtype")

# Sites where aromatic mutations are more deleterious (lower mean effect) than
# hydrophobic mutations, excluding sites with a hydrophilic wildtype
aromatic_more_deleterious = (
    pivoted[
        (cavity_mean_by_type["Aromatic"] < cavity_mean_by_type["Hydrophobic"])
        & (cavity_wildtype_class != "Hydrophilic")
    ]
    .index.get_level_values(0)
    .tolist()
)
print(f"Aromatic more deleterious than hydrophobic: {aromatic_more_deleterious} \n(n={len(aromatic_more_deleterious)})")

# Sites where hydrophobic OR aromatic mutations have a lower mean effect than
# hydrophilic ones, and the wildtype residue is hydrophilic or charged.
# NOTE(review): the earlier comment here said "both ... and"; the code has always
# used OR (as does the accessibility-only version) -- confirm which is intended.
cavity_nonpolar_lower = (
    cavity_mean_by_type["Hydrophobic"] < cavity_mean_by_type["Hydrophilic"]
) | (
    cavity_mean_by_type["Aromatic"] < cavity_mean_by_type["Hydrophilic"]
)
cavity_polar_wildtype = cavity_wildtype_class.isin(
    ["Hydrophilic", "Positive", "Negative"]
)

sites_meeting_criteria = (
    pivoted[cavity_nonpolar_lower & cavity_polar_wildtype]
    .index.get_level_values(0)
    .tolist()
)

print(f"Hydrophilic sites: {sites_meeting_criteria} \n(n={len(sites_meeting_criteria)})")

In [None]:
# Show the pivot rows for the selected sites. `pivoted` and `sites_meeting_criteria`
# are whatever was bound last; under top-to-bottom execution that is the
# cavity-based pivot and selection, not the accessibility-only ones.
display(pivoted.query('site in @sites_meeting_criteria'))