### Correlation of entry scores with residue accessibility
Calculate per-residue solvent accessibility from the PDB structure and compare it to the entry scores from deep mutational scanning (DMS).

In [1]:
import pandas as pd
import altair as alt
import httpimport

import numpy as np
import scipy.stats

from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley

_ = alt.data_transformers.disable_max_rows()

In [2]:
# Import custom altair theme from remote github using httpimport module
def import_theme_new():
    """Fetch ``main_theme`` from GitHub and register it as the active Altair theme.

    The module is imported from the ``main`` ref of the
    ``bblarsen-sci/altair_themes`` repository via ``httpimport`` and its
    ``main_theme()`` result is registered (and enabled) under the name
    ``"custom_theme"``.
    """
    remote_repo = httpimport.github_repo("bblarsen-sci", "altair_themes", "main")
    with remote_repo:
        import main_theme

        def custom_theme():
            return main_theme.main_theme()

        # same effect as decorating ``custom_theme`` with the register call
        alt.theme.register("custom_theme", enable=True)(custom_theme)


import_theme_new()

### Load the Nipah F structure (PDB) and parse it to get residue accessibility
Accessibility is computed with Biopython's Shrake-Rupley implementation: https://biopython.org/docs/dev/api/Bio.PDB.SASA.html

In [None]:
# Parse the structure file supplied by snakemake under the id "5EVM"
structure = PDBParser(QUIET=True).get_structure("5EVM", snakemake.input.pdb)

# Run Shrake-Rupley at residue level ("R"); the per-residue ``sasa`` values
# are read back from the structure when the accessibility table is built
ShrakeRupley().compute(structure, level="R")

In [None]:
# get sites and accessibility for each residue in all three chains
chain_ids = ("A", "B", "C")
model = structure[0]

records = []
for site in range(29, 482):
    record = {"site": site}
    for chain_id in chain_ids:
        chain = model[chain_id]
        # a site that is absent from this chain is recorded as missing (NaN)
        record[f"accessibility_{chain_id}"] = (
            round(chain[site].sasa, 2) if site in chain else np.nan
        )
    records.append(record)

df = pd.DataFrame(records)

# calculate mean accessibility across chains (pandas skips NaN by default,
# so the mean uses whichever chains contain the site)
chain_columns = [f"accessibility_{chain_id}" for chain_id in chain_ids]
df["mean_accessibility"] = df[chain_columns].mean(axis=1).round(2)

# save accessibility data to csv
df.to_csv(snakemake.output.accessibility_df, index=False)

### Read in entry scores from DMS and merge with residue accessibility

In [None]:
# read in mean entry data
entry_df = pd.read_csv(snakemake.input.entry_df)

# merge with accessibility data; the left join keeps every site of the
# accessibility table, with missing values where the entry data has no match
merged_df = df.merge(entry_df, on="site", how="left")

# Assign amino acid type
def assign_aa_type(site_num):
    """Return the residue-type label for an amino-acid letter.

    Despite its name, ``site_num`` is compared against one-letter amino-acid
    codes rather than site numbers.

    Returns one of "Negative", "Positive", "Hydrophilic", "Hydrophobic",
    "Aromatic" or "Special"; any value not listed in a group (including
    missing values) yields "Other".
    """
    residue_groups = (
        ("Negative", ["D", "E"]),
        ("Positive", ["K", "R", "H"]),
        ("Hydrophilic", ["Q", "N", "S", "T"]),
        ("Hydrophobic", ["A", "I", "L", "M", "V"]),
        ("Aromatic", ["Y", "W", "F"]),
        ("Special", ["C", "G", "P"]),
    )
    for label, members in residue_groups:
        # list membership, so only exact single-letter matches count
        if site_num in members:
            return label
    return "Other"

# label every site with the residue type of its wildtype amino acid
merged_df["wildtype_type"] = merged_df["wildtype"].map(assign_aa_type)

In [None]:
##### calculate R value:
# The accessibility table stores NaN for sites absent from a chain, and the
# left merge leaves NaN effects for sites without entry data. A NaN in either
# input would turn the reported r and p values into NaN, so the regressions
# are restricted to rows where both quantities are present.
regression_df = merged_df.dropna(subset=["mean_accessibility", "effect"])

# correlation within each residue type
for i in regression_df["wildtype_type"].unique():
    tmp_df = regression_df[regression_df["wildtype_type"] == i]
    # a fit needs at least two distinct x values; report and skip otherwise
    if tmp_df["mean_accessibility"].nunique() < 2:
        print(f"Residue type: {i},  too few sites for a regression")
        continue
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        tmp_df["mean_accessibility"], tmp_df["effect"]
    )
    r_value = float(r_value)
    print(f"Residue type: {i},  r_value: {r_value:.2f}, p_value: {p_value:.2e}")

# correlation across all sites; this final ``r_value`` is the one left in
# scope for the plot annotation further down
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    regression_df["mean_accessibility"], regression_df["effect"]
)
r_value = float(r_value)
print(f"Overall r_value: {r_value:.2f}, p_value: {p_value:.2e}")

In [None]:
# plot accessibility of chain A vs chain B, colored by amino acid type
chain_a_axis = alt.X("accessibility_A", title="A Accessibility (Å²)")
chain_b_axis = alt.Y("accessibility_B", title="B Accessibility (Å²)")
residue_type_color = alt.Color("wildtype_type")

accessibility_chart = alt.Chart(merged_df).mark_circle(size=60).encode(
    x=chain_a_axis,
    y=chain_b_axis,
    color=residue_type_color,
    tooltip=["site", "wildtype", "accessibility_B", "effect"],
)
display(accessibility_chart)

In [None]:
# plot mean accessibility vs effect, with r value
# scatter layer: one outlined point per site
scatter_points = alt.Chart(merged_df).mark_circle(
    size=50, opacity=1, stroke="black", strokeWidth=0.75
)
chart = scatter_points.encode(
    x=alt.X("mean_accessibility", title="Surface Accessibility (Å²)"),
    y=alt.Y("effect", title="Mean Entry of Mutations at Site"),
    tooltip=["site", "mean_accessibility", "effect", "wildtype"],
).properties(width=200, height=200)

# annotation layer: a single inline record anchored at data coordinates (0, 0)
# carrying the correlation coefficient currently held in ``r_value``
annotation_data = {
    "values": [
        {
            "x": 0,
            "y": 0,
            "text": f"r = {r_value:.2f}",
        }
    ]
}
annotation_mark = alt.Chart(annotation_data).mark_text(
    align="left",
    baseline="top",
    dx=10,
    dy=-15,
)
text = annotation_mark.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")

# overlay the annotation on the scatter plot
final_chart = alt.layer(chart, text)
display(final_chart)

In [None]:
# save the annotated scatter plot as PNG (ppi=300) and as SVG
all_sites_png = snakemake.output.access_vs_effect_all_png
all_sites_svg = snakemake.output.access_vs_effect_all_svg
final_chart.save(all_sites_png, ppi=300)
final_chart.save(all_sites_svg)

In [None]:
# plot mean accessibility vs effect, colored by amino acid type, faceted
accessibility_axis = alt.X(
    "mean_accessibility",
    title="Surface Accessibility (Å²)",
)
effect_axis = alt.Y("effect", title="Mean Entry Score at Site")
# one panel per residue type, three panels per row
residue_type_facet = alt.Facet(
    "wildtype_type", title=None, columns=3, bounds="full"
)

faceted_points = alt.Chart(merged_df).mark_circle(
    size=50, opacity=1, stroke="black", strokeWidth=0.75
)
chart = faceted_points.encode(
    x=accessibility_axis,
    y=effect_axis,
    # the facet headers already name the residue type, so no color legend
    color=alt.Color("wildtype_type", legend=None),
    facet=residue_type_facet,
    tooltip=["site", "mean_accessibility", "effect", "wildtype"],
).properties(
    width=200,
    height=200,
)
display(chart)

In [None]:
# save the faceted residue-type plot as PNG (ppi=300) and as SVG
residue_type_png = snakemake.output.access_vs_effect_residue_type_png
residue_type_svg = snakemake.output.access_vs_effect_residue_type_svg
chart.save(residue_type_png, ppi=300)
chart.save(residue_type_svg)