This notebook identifies candidate stabilizing proline mutations using cell entry DMS data and structural features.

In [None]:
import pandas as pd
from tempfile import gettempdir
import numpy as np
import biotite.database.rcsb as rcsb
import biotite.sequence as seq
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx

import hydride

In [None]:
# Read the filtered cell entry functional effects table and show it
entry_path = "../../results/filtered_data/cell_entry/Nipah_F_func_effects_filtered.csv"
entry_df = pd.read_csv(entry_path)
display(entry_df)

In [None]:
# Download entry 5EVM as a CIF file into the system temp directory,
# then parse the downloaded file into a structure
download_dir = gettempdir()
file_name = rcsb.fetch("5EVM", "cif", download_dir)
structure = strucio.load_structure(file_name)

In [None]:
# Keep only the atoms that belong to chains A, B and C
structure = structure[np.isin(structure.chain_id, ["A", "B", "C"])]

In [None]:
# Detect hydrogen bonds with biotite after adding hydrogens with hydride.
# Bonds and per-atom charges are attached first, then hydrogens are added
# and their coordinates relaxed.
structure.bonds = struc.connect_via_residue_names(structure)
charges = hydride.estimate_amino_acid_charges(structure, ph=7.0)
structure.set_annotation("charge", charges)
structure, mask = hydride.add_hydrogen(structure)
structure.coord = hydride.relax_hydrogen(structure)

# Each triplet holds three atom indices; the first is used as the donor and
# the last as the acceptor, the middle one is not needed below.
triplets = struc.hbond(structure, cutoff_dist=3)

# One record per hydrogen bond, describing the donor and the acceptor atom
data = [
    {
        "donor_chain": structure.chain_id[don],
        "donor_resid": structure.res_id[don],
        "donor_charge": structure.charge[don],
        "donor_resname": structure.res_name[don],
        "donor_atom": structure.atom_name[don],
        "acceptor_chain": structure.chain_id[acc],
        "acceptor_resid": structure.res_id[acc],
        "acceptor_charge": structure.charge[acc],
        "acceptor_resname": structure.res_name[acc],
        "acceptor_atom": structure.atom_name[acc],
    }
    for don, _middle, acc in triplets
]



In [None]:
# Tabulate the hydrogen bond records, one row per donor/acceptor pair
df = pd.DataFrame(data)

# Atom names treated as backbone, and residue names treated as non-peptide
backbone_atoms = ["C", "N", "CA", "O"]
non_peptide_residues = ["NAG", "MAN", "MLI"]

# A bond is kept only when neither atom is a backbone atom and neither
# residue is one of the non-peptide residues
neither_backbone = ~(
    df["donor_atom"].isin(backbone_atoms) | df["acceptor_atom"].isin(backbone_atoms)
)
neither_non_peptide = ~(
    df["donor_resname"].isin(non_peptide_residues)
    | df["acceptor_resname"].isin(non_peptide_residues)
)
sidechain_h_bonds = df[neither_backbone & neither_non_peptide]

# Residue numbers seen on the donor side and on the acceptor side
donor_site_list = sidechain_h_bonds["donor_resid"].unique().tolist()
acceptor_site_list = sidechain_h_bonds["acceptor_resid"].unique().tolist()

# Sorted union of both sides (chain identity is not part of the key)
hydrogen_bonding_sites = sorted(set(donor_site_list) | set(acceptor_site_list))

# Show the sites involved in sidechain-sidechain hydrogen bonds
print(hydrogen_bonding_sites)



In [None]:
# Salt bridges: sidechain hydrogen bonds whose donor and acceptor atoms carry
# opposite unit charges (+1/-1 in either order)
donor_q = sidechain_h_bonds["donor_charge"]
acceptor_q = sidechain_h_bonds["acceptor_charge"]
opposite_unit_charges = ((donor_q == 1) & (acceptor_q == -1)) | (
    (donor_q == -1) & (acceptor_q == 1)
)
salt_bridge_sites = sidechain_h_bonds[opposite_unit_charges]

# Sorted, de-duplicated residue numbers from both sides of those bonds
salt_bridge_site_list = sorted(
    set(salt_bridge_sites["donor_resid"].unique().tolist())
    | set(salt_bridge_sites["acceptor_resid"].unique().tolist())
)
print(salt_bridge_site_list)

In [None]:
# Preview hydrogen bonds whose donor and acceptor are on different chains
crosses_chains = sidechain_h_bonds["donor_chain"] != sidechain_h_bonds["acceptor_chain"]
display(sidechain_h_bonds[crosses_chains].head(3))

# Preview possibly problematic hydrogen bonds: both atoms carry the same
# non-zero charge
same_nonzero_charge = (
    sidechain_h_bonds["donor_charge"] == sidechain_h_bonds["acceptor_charge"]
) & (sidechain_h_bonds["donor_charge"] != 0)
display(sidechain_h_bonds[same_nonzero_charge].head(3))

In [None]:
# Read the secondary structure stored in the CIF file with biotite.
# Only the entry for chain "A" is used from here on.
pdbx_file = pdbx.CIFFile.read(file_name)
sse_by_chain = pdbx.get_sse(pdbx_file)
sse = sse_by_chain['A']

# Helper that records one secondary structure element on an annotation
def _add_sec_str(annotation, first, last, str_type):
    """Add a "SecStr" feature spanning `first`..`last` to `annotation`.

    `str_type` "a" is stored as "helix" and "b" as "sheet"; any other
    value (coil) is ignored and nothing is added.
    """
    label = {"a": "helix", "b": "sheet"}.get(str_type)
    if label is None:
        # coil
        return
    location = seq.Location(first, last)
    annotation.add_feature(
        seq.Feature("SecStr", [location], {"sec_str_type": label})
    )

# Collapse the per-residue secondary structure array into runs of identical
# codes and record each helix/sheet run on the annotation.
# Positions are 1-based and inclusive: array index k maps to position k + 1.
# NOTE(review): these positions are offsets into `sse`, not residue IDs read
# from the structure - confirm they line up with the `site` numbering used
# further down.
annotation = seq.Annotation()
curr_start = 0
for i in range(1, len(sse) + 1):
    # A run is finished when the array ends or when the code changes.
    # Closing the final run inside the loop keeps its last residue (the
    # previous post-loop call stopped one position short) and makes an
    # empty array a no-op instead of an error.
    if i == len(sse) or sse[i] != sse[curr_start]:
        # The run covers indices curr_start .. i - 1, i.e. positions
        # curr_start + 1 .. i.
        _add_sec_str(annotation, curr_start + 1, i, sse[curr_start])
        curr_start = i

# Expand every annotated feature into the list of positions it covers,
# grouped by secondary structure type
helix_data = []
sheet_data = []
positions_by_type = {"helix": helix_data, "sheet": sheet_data}
for f in sorted(list(annotation)):
    first_pos, last_pos = (int(bound) for bound in f.get_location_range())
    bucket = positions_by_type.get(str(f.qual["sec_str_type"]))
    if bucket is not None:
        # Both ends of the feature are included
        bucket.append(list(range(first_pos, last_pos + 1)))


# Flatten the per-feature position lists into one list per type
helix_sites = [pos for positions in helix_data for pos in positions]
sheet_sites = [pos for positions in sheet_data for pos in positions]
print(helix_sites)
print(sheet_sites)

In [None]:
# Summarise the effects measured at each site: the maximum, the standard
# deviation, and the smallest of the (up to) four largest effects
effects_by_site = entry_df.groupby("site")["effect"]
high_variation_df = effects_by_site.agg(
    effect_max="max",
    effect_std="std",
    effect_min_top4=lambda effects: effects.nlargest(4).min(),
).reset_index()

# Keep sites with enough spread (std >= 0.5) whose top four effects are all
# at least -1; low-variation or uniformly negative sites are not interesting
passes_filter = (high_variation_df["effect_std"] >= 0.5) & (
    high_variation_df["effect_min_top4"] >= -1
)
high_variation_sites = high_variation_df.loc[passes_filter, "site"].tolist()

print(f"There are {len(high_variation_sites)} high variation sites")

# Flag rows that are a Proline mutation with effect < -3
meets_condition = (entry_df["mutant"] == "P") & (entry_df["effect"] < -3)

# A site is valid when at least one of its rows is flagged
site_has_hit = meets_condition.groupby(entry_df["site"]).any()
valid_sites = site_has_hit[site_has_hit].index.tolist()
print(f'There are {len(valid_sites)} valid sites where all Proline mutations have effect < -3')

In [None]:
# Sites 49-99 and 125-285 (both ends included) are the only ones considered.
# NOTE(review): the reason for these two windows is not stated here - confirm.
site_ranges_list = [*range(49, 100), *range(125, 286)]
print(site_ranges_list)

In [None]:
# Add one boolean column per site list, telling whether the row's site is in
# that list (secondary structure, h-bond / salt bridge, and filter lists)
site_flags = {
    "helix_site": helix_sites,
    "sheet_site": sheet_sites,
    "hydrogen_bonding_sites": hydrogen_bonding_sites,
    "salt_bridge_site": salt_bridge_site_list,
    "high_variation_sites": high_variation_sites,
    "valid_sites": valid_sites,
    "site_ranges_list": site_ranges_list,
}
entry_df = entry_df.assign(
    **{column: entry_df["site"].isin(members) for column, members in site_flags.items()}
)
display(entry_df)

In [None]:
# Candidate stabilizing proline mutation sites: none of the structural flags
# may be set, and all of the selection flags must be set
print('All sites')

excluded_flags = ["helix_site", "sheet_site", "hydrogen_bonding_sites", "salt_bridge_site"]
required_flags = ["high_variation_sites", "valid_sites", "site_ranges_list"]
is_candidate = ~entry_df[excluded_flags].any(axis=1) & entry_df[required_flags].all(axis=1)

total_site_list = entry_df.loc[is_candidate, "site"].unique().tolist()
print(f"Total sites: {len(total_site_list)}")
print(total_site_list)

