# Compare natural variation of GPC to DMS scores for specific regions of the protein

This notebook compares the functional effects of mutations with the natural variation (i.e., the number of effective amino acids) observed at each site, for different regions of GPC.

In [None]:
# Imports
import os
import warnings
import scipy as sp
import pandas as pd
import altair as alt

# Silence all warnings so they do not clutter the rendered notebook
warnings.filterwarnings("ignore")

# Lift Altair's default row limit so larger dataframes can be plotted
_ = alt.data_transformers.disable_max_rows()

# Plotting colors, re-arranged for the plots below. The order matters:
# the plotting cells pair these colors positionally with their list of
# GPC regions, so only the leading entries are actually used.
tol_muted_adjusted = [
    "#AA4499", "#88CCEE", "#EE7733", "#44AA99",
    "#1f78b4", "#CC6677", "#117733", "#999933",
    "#DDCC77", "#CC3311", "#882255", "#000000",
    "#DDDDDD",
]

In [None]:
# this cell is tagged as `parameters` for papermill parameterization
# Both values are placeholders that must be replaced with CSV file paths
# before the data-loading cell runs (it passes them to `pd.read_csv`).

# CSV of natural sequence variation; the notebook reads its `site` and
# `n_effective` columns.
natural_sequence_variation = None
# CSV of filtered functional effects; the notebook reads its `site`,
# `wildtype`, and `effect` columns.
filtered_func_293T = None

In [None]:
# # Uncomment for running interactive
# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# filtered_func_293T = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"

In [None]:
# Load data as dataframes
natural_df = pd.read_csv(natural_sequence_variation)
functional_scores = pd.read_csv(filtered_func_293T)

# Label SSP, GP1, and GP2 regions of GPC.
# Vectorized replacement for a row-wise `apply`: start with the broadest
# label and let the narrower site cutoffs overwrite it.
functional_scores["region"] = "GP2"
functional_scores.loc[functional_scores["site"] <= 259, "region"] = "GP1"
functional_scores.loc[functional_scores["site"] <= 58, "region"] = "SSP"

# GP2 transmembrane domain (428 - 447)
TM = list(range(428, 448))

# GP2 cytoplasmic tail (448 - 491)
CT = list(range(448, 492))

# Alpha dystroglycan binding sites
DG_sites = [
    120,
    121,
    125,
    150, # supported by 151 and 125 interactions
    151,
    256,
    257,
    258,
]

# LAMP1 binding sites
LAMP1_sites = [
    92, # histidine triad
    93, # histidine triad
    172,
    173,
    188,
    192,
    195,
    197,
    198,
    200,
    201,
    202,
    204,
    206,
    207,
    211,
    216,
    230, # histidine triad
]

# Glycosylation sites N - X - S/T
glycans = [
    79, 80, 81,
    89, 90, 91,
    99, 100, 101,
    109, 110, 111,
    119, 120, 121,
    167, 168, 169,
    224, 225, 226,
    365, 366, 367,
    373, 374, 375,
    390, 391, 392,
    395, 396, 397,
]


def _copy_sites_as_region(scores, sites, label):
    """Return a copy of the rows of `scores` at `sites`, relabelled as `label`."""
    subset = scores.loc[scores["site"].isin(sites)].copy()
    subset["region"] = label
    return subset


# Every overlay region is cut from the SSP/GP1/GP2-labelled table only,
# never from a table that already contains other overlays. Selecting from
# the accumulated table would copy a site once per overlay it already
# belongs to (e.g. sites 120 and 121 are in both the alpha-DG and the
# glycan lists), duplicating rows in the later overlay.
primary_scores = functional_scores

TM_sites = _copy_sites_as_region(primary_scores, TM, "TM")
CT_sites = _copy_sites_as_region(primary_scores, CT, "CT")
DG_binding_sites = _copy_sites_as_region(
    primary_scores, DG_sites, "\u03B1-DG binding sites"
)
LAMP1_binding_sites = _copy_sites_as_region(
    primary_scores, LAMP1_sites, "LAMP1 binding sites"
)
glycan_sites = _copy_sites_as_region(
    primary_scores, glycans, "N-glycosylation sites"
)

# Stack the primary labels and all overlays; a site therefore appears once
# per region it belongs to.
functional_scores = pd.concat(
    [
        primary_scores,
        TM_sites,
        CT_sites,
        DG_binding_sites,
        LAMP1_binding_sites,
        glycan_sites,
    ],
    ignore_index=True,
)

# Average site functional effects
functional_scores = (
    functional_scores.groupby(["site", "wildtype", "region"])
    .aggregate({
        "effect" : "mean"
    })
    .reset_index()
)

# Add natural variation (effective amino acids) per site.
# `validate` makes the merge fail loudly if the natural variation table
# lists a site more than once.
functional_scores = (
    functional_scores.merge(
        natural_df[["site", "n_effective"]],
        how="left",
        on=["site"],
        validate="many_to_one",
    )
)

Plot correlations of site mean effects on cell entry and site effective amino acids for each region of GPC.

In [None]:
# Import the function explicitly: `import scipy as sp` on its own does not
# guarantee that the `stats` submodule is loaded on every SciPy version.
from scipy.stats import pearsonr

# Regions in plotting order; the position also selects the point color
region_order = [
    "SSP",
    "GP1",
    "GP2",
    "TM",
    "CT",
    "\u03B1-DG binding sites",
    "LAMP1 binding sites",
    "N-glycosylation sites",
]

subplots = []
for index, region in enumerate(region_order):

    # Rows for this region only (filtered once, reused for stats and plot)
    region_scores = functional_scores.loc[functional_scores["region"] == region]

    # Calculate statistics on rows without missing values; the p-value is
    # not reported, so it is discarded
    complete_scores = region_scores.dropna()
    r, _ = pearsonr(complete_scores["effect"], complete_scores["n_effective"])
    print(f"r correlation for {region}: {r:.2f}")

    curr_subplot = alt.Chart(region_scores, title=region).mark_point(
        filled=True,
        color=tol_muted_adjusted[index],
        size=75,
        opacity=0.25,
    ).encode(
        alt.X(
            "n_effective",
            axis=alt.Axis(
                title="effective amino acids",
                values=[1, 2, 3, 4],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[0.9, 4.1])
        ),
        alt.Y(
            "effect",
            axis=alt.Axis(
                title=["site mean", "effect on cell entry"],
                values=[-4,-3,-2,-1,0,1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-4.1,1.1])
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "n_effective",
        ],
    ).properties(
        width=250,
        height=150,
    )

    subplots.append(curr_subplot)

# Create two rows of subregions (first four regions, then the remainder)
row1 = alt.hconcat(*subplots[:4], spacing=5)
row2 = alt.hconcat(*subplots[4:], spacing=5)

# Create final combined plot
natural_vs_func_effects = alt.vconcat(
    row1,
    row2,
    spacing=10,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

natural_vs_func_effects

Plot distributions of site mean effects on cell entry and effective amino acids for each region of GPC

In [None]:
# Regions in plotting order; shared by the y-axis sort and the color scale
region_order = [
    "SSP",
    "GP1",
    "GP2",
    "TM",
    "CT",
    "\u03B1-DG binding sites",
    "LAMP1 binding sites",
    "N-glycosylation sites",
]


def _region_strip_plot(x_field, x_title, x_ticks, x_domain, y_title, show_y_labels):
    """Build a jittered strip plot of one quantitative column per GPC region.

    Parameters
    ----------
    x_field : str
        Column of `functional_scores` shown on the x-axis.
    x_title : str or list of str
        X-axis title (a list gives a multi-line title).
    x_ticks : list
        Tick positions on the x-axis.
    x_domain : list
        Lower and upper limit of the x-axis scale.
    y_title : str or None
        Title of the region axis; `None` removes the title.
    show_y_labels : bool
        Whether the region names are drawn on the y-axis.

    Returns
    -------
    alt.Chart
        Chart with one row of points per region, colored by region.
    """
    y_axis_options = {
        "domainWidth": 1,
        "domainColor": "black",
        "tickColor": "black",
    }
    # Only set `labels` when hiding them, so the default is left untouched
    if not show_y_labels:
        y_axis_options["labels"] = False

    return alt.Chart(
        functional_scores,
    ).mark_circle(opacity=0.25, size=75).encode(
        y=alt.Y(
            "region:N",
            title=y_title,
            sort=region_order,
            axis=alt.Axis(**y_axis_options),
        ),
        x=alt.X(
            f"{x_field}:Q",
            title=x_title,
            axis=alt.Axis(
                values=x_ticks,
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=x_domain)
        ),
        yOffset="jitter:Q",
        color=alt.Color(
            "region:N",
            scale=alt.Scale(
                domain=region_order,
                range=tol_muted_adjusted
            ),
        ).legend(None),
        tooltip=[
            "site",
            "wildtype",
            alt.Tooltip(
                "effect", format=".2f", title="effect on cell entry"
            ),
            alt.Tooltip(
                "n_effective", format=".2f", title="n effective amino acids"
            ),
        ],
    ).transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    ).properties(
        width=250,
        height=400,
    )


# Plot distribution of site mean effects on cell entry for each region
DMS_scores = _region_strip_plot(
    x_field="effect",
    x_title=["site mean", "effect on cell entry"],
    x_ticks=[-4,-3,-2,-1,0,1],
    x_domain=[-4.1, 1.1],
    y_title="GPC region",
    show_y_labels=True,
)

# Plot distribution of effective amino acids for each region; the region
# labels are hidden because this panel sits to the right of the one above
natural_variation = _region_strip_plot(
    x_field="n_effective",
    x_title=["effective", "amino acids"],
    x_ticks=[1,2,3,4],
    x_domain=[0.9, 4.1],
    y_title=None,
    show_y_labels=False,
)


# Create combined plot
combined_plot = alt.hconcat(
    DMS_scores,
    natural_variation,
    spacing=10,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

combined_plot