# plot_BLI_data.ipynb
Analyzes the correlation between BLI binding measurements and DMS binding scores.

- Written by Brendan Larsen

In [None]:
# this cell is tagged as parameters for `papermill` parameterization
# Every parameter defaults to None; later cells fill in hard-coded paths
# when `BLI_corr_plot` is still None.

# path to a Python file with the Altair theme; executed later if set
altair_config = None
# output path the combined correlation plot is saved to
BLI_corr_plot = None

# CSV files with the filtered DMS binding data (E2 and E3), read with pandas
binding_E2_file = None
binding_E3_file = None

In [None]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml

In [None]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# Project root that all relative paths in this notebook are resolved against.
project_dir = (
    "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
)

# `os.getcwd()` does not end in a trailing slash, so comparing it against the
# raw string above could never match; strip the slash before comparing.
if os.getcwd() == project_dir.rstrip("/"):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

In [None]:
# Hard-coded paths for running the notebook without snakemake: used whenever
# no output plot path was supplied. Note that `BLI_corr_plot` itself is not
# assigned here and therefore remains None.
if BLI_corr_plot is None:
    print("loading hard paths")

    # input files
    binding_E2_file, binding_E3_file = (
        "results/filtered_data/binding/e2_binding_filtered.csv",
        "results/filtered_data/binding/e3_binding_filtered.csv",
    )

    # plot theme and project configuration
    altair_config, nipah_config = (
        "data/custom_analyses_data/theme.py",
        "nipah_config.yaml",
    )

In [None]:
# Apply the Altair theme file, if one was configured.
# NOTE(review): the file's contents are run with `exec` in this notebook's
# namespace, so `altair_config` must only ever point at trusted code.
if altair_config:
    with open(altair_config, "r") as theme_file:
        theme_source = theme_file.read()
    exec(theme_source)

# Loading of the project config is currently disabled:
#with open(nipah_config) as f:
#    config = yaml.safe_load(f)

In [None]:
# Read the E2 and E3 DMS binding tables, previewing the first three rows of
# each right after it is loaded.
binding_frames = []
for csv_path in (binding_E2_file, binding_E3_file):
    frame = pd.read_csv(csv_path)
    display(frame.head(3))
    binding_frames.append(frame)

df_E2_filter, df_E3_filter = binding_frames

In [None]:
### load BLI data and merge with binding from DMS
BLI_df = pd.read_csv("data/custom_analyses_data/experimental_data/BLI_data.csv")

# Columns taken from each DMS binding table.
dms_cols = ["site", "wildtype", "mutant", "binding_mean"]

# Left merges keep every BLI-measured variant even when it has no DMS match.
# The E2 merge introduces `wildtype`, which then takes part in the E3 merge
# key; the shared `binding_mean` column is disambiguated by the suffixes.
# NOTE(review): a variant missing from the E2 table gets a NaN `wildtype`
# and so cannot match the E3 table either — confirm this is acceptable.
BLI_w_E2 = pd.merge(
    BLI_df,
    df_E2_filter[dms_cols],
    on=["site", "mutant"],
    how="left",
)
df = pd.merge(
    BLI_w_E2,
    df_E3_filter[dms_cols],
    on=["site", "mutant", "wildtype"],
    how="left",
    suffixes=["_E2", "_E3"],
)

# Make mutation column (e.g. wildtype 'Q', site 530, mutant 'F' -> 'Q530F')
df["mutation"] = (
    df["wildtype"].astype(str) + df["site"].astype(str) + df["mutant"].astype(str)
).astype(object)
df = df.drop(columns=["site", "mutant", "wildtype"])

# Add unmutated information: an all-zero reference row labelled 'Unmutated'
new_row = {
    "B2_1": 0,
    "B2_2": 0,
    "B3_1": 0,
    "B3_2": 0,
    "binding_mean_E2": 0,
    "binding_mean_E3": 0,
    "mutation": "Unmutated",
}
new_df = pd.DataFrame([new_row])

# combine; `ignore_index=True` renumbers the rows so the appended reference
# row does not share index label 0 with the first variant
df = pd.concat([df, new_df], ignore_index=True)
display(df)

In [None]:
# Per-variant summary of the two BLI replicates for each ligand prefix:
# row-wise mean, row-wise standard deviation, and mean +/- std bounds that
# the error bars are drawn from.
replicate_cols = {"B2": ["B2_1", "B2_2"], "B3": ["B3_1", "B3_2"]}

for prefix, reps in replicate_cols.items():
    df[f"{prefix}_mean"] = df[reps].mean(axis=1)

for prefix, reps in replicate_cols.items():
    df[f"{prefix}_std"] = df[reps].std(axis=1)

for prefix in replicate_cols:
    df[f"{prefix}_upper"] = df[f"{prefix}_mean"] + df[f"{prefix}_std"]
    df[f"{prefix}_lower"] = df[f"{prefix}_mean"] - df[f"{prefix}_std"]

display(df)

In [None]:
##### calculate R value:
def _pearson_r(frame, x_col, y_col):
    """Return the Pearson correlation coefficient between two columns.

    Rows where either column is missing are dropped first: the DMS values
    come from left merges, and a single NaN would otherwise turn the whole
    coefficient into NaN.
    """
    paired = frame[[x_col, y_col]].dropna()
    return float(scipy.stats.linregress(paired[x_col], paired[y_col]).rvalue)


# BLI replicate mean vs. DMS binding, separately for each ligand
r_value_E2 = _pearson_r(df, "B2_mean", "binding_mean_E2")
print(r_value_E2)

r_value_E3 = _pearson_r(df, "B3_mean", "binding_mean_E3")
print(r_value_E3)

In [None]:
def custom_sort_order(array):
    """Return the labels in legend order.

    Labels are sorted by the first run of digits they contain (e.g. 530 in
    'Q530F'); labels without any digits sort as 0. If 'Unmutated' is
    present, it is moved to the front. The input is left untouched and a
    new list is returned.
    """
    digits = re.compile(r"\d+")

    def site_number(label):
        # First numeric run in the label, or 0 when there is none.
        found = digits.search(label)
        if found is None:
            return 0
        return int(found.group())

    ordered = sorted(array, key=site_number)
    if "Unmutated" not in ordered:
        return ordered

    # Pull the first 'Unmutated' entry out and place it at the head.
    position = ordered.index("Unmutated")
    return ["Unmutated", *ordered[:position], *ordered[position + 1 :]]


# Ten-colour palette for the mutant variants.
# NOTE(review): despite the variable name, these hex codes look like Vega's
# "tableau10" scheme rather than "category10" — confirm which was intended.
category10_colors = [
    "#4e79a7",
    "#f28e2c",
    "#e15759",
    "#76b7b2",
    "#59a14f",
    "#edc949",
    "#af7aa1",
    "#ff9da7",
    "#9c755f",
    "#bab0ab",
]

# 'black' takes the first slot of the colour range; the remaining slots are
# filled from the palette, one per additional unique mutation label.
n_variants = len(df["mutation"].unique())
colors = ["black", *category10_colors[: n_variants - 1]]

# Scatter of the BLI replicate mean against DMS binding for the chart
# titled "bEFNB2", one point per variant, coloured by variant.
corr_chart = (
    alt.Chart(df, title=alt.Title("bEFNB2"))
    .mark_point(size=125, filled=True, opacity=1)
    .encode(
        x=alt.X(
            "binding_mean_E2:Q",
            title="DMS Binding",
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B2_mean",
            title="Binding (% change AUC)",
            axis=alt.Axis(tickCount=4),
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
    )
)

# Anchor for the correlation label: smallest x and largest y in the data,
# truncated to integers by int().
min_effect_E2 = int(df["binding_mean_E2"].min())
max_mean_luciferase_E2 = int(df["B2_mean"].max())

# Single-row text layer showing the Pearson r; dx/dy nudge it by hand.
text = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value_E2:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-10,  # Adjust this for position
        dy=-20,  # Adjust this for position
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

# Error bars spanning mean - std to mean + std of the replicates.
error = (
    alt.Chart(df)
    .mark_errorbar(opacity=1)
    .encode(
        x="binding_mean_E2",
        y=alt.Y("B2_lower", title="Binding (% change AUC)"),
        y2="B2_upper",
        color="mutation",
    )
)

# Layer points, error bars and label. The size is applied to
# `final_chart_E2` itself: previously it was only applied to a separate
# `final_chart`, so any later use of `final_chart_E2` got an unsized panel.
final_chart_E2 = (corr_chart + error + text).properties(height=200, width=200)
final_chart = final_chart_E2  # kept so the old name still works
final_chart




In [None]:
# Scatter of the BLI replicate mean against DMS binding for the chart
# titled "bEFNB3", one point per variant, coloured by variant.
corr_chart_E3 = (
    alt.Chart(df, title=alt.Title("bEFNB3"))
    .mark_point(size=125, filled=True, opacity=1)
    .encode(
        x=alt.X(
            "binding_mean_E3:Q",
            title="DMS Binding",
            scale=alt.Scale(domain=[-1, 1.5]),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B3_mean",
            title="Binding (% max response)",
            axis=alt.Axis(tickCount=4),
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
    )
)

# Anchor for the correlation label: smallest x and largest y in the data,
# truncated to integers by int(). These use E3-specific names; they were
# previously written into `min_effect_E2` / `max_mean_luciferase_E2`,
# silently replacing the values computed for the bEFNB2 chart.
min_effect_E3 = int(df["binding_mean_E3"].min())
max_B3_mean = int(df["B3_mean"].max())

# Single-row text layer showing the Pearson r; dx/dy nudge it by hand.
text_E3 = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E3,
                    "y": max_B3_mean,
                    "text": f"r = {r_value_E3:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-70,  # Adjust this for position
        dy=-30,  # Adjust this for position
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

# Error bars spanning mean - std to mean + std of the replicates; named
# `error_E3` so the bEFNB2 layer called `error` is not overwritten.
error_E3 = (
    alt.Chart(df)
    .mark_errorbar(opacity=1)
    .encode(
        x="binding_mean_E3",
        y=alt.Y("B3_lower", title="Binding (% max response)"),
        y2="B3_upper",
        color="mutation",
    )
)

# Layer points, error bars and label, then fix the panel size.
final_chart_E3 = (corr_chart_E3 + error_E3 + text_E3).properties(
    height=200, width=200
)
final_chart_E3

In [None]:
# Put the two correlation panels side by side and show them inline.
combined = alt.hconcat(final_chart_E2, final_chart_E3)
combined.display()

# `BLI_corr_plot` is still None when the notebook runs on its hard-coded
# fallback paths, and saving to None would fail; only write the figure when
# an output path was actually supplied.
if BLI_corr_plot is not None:
    combined.save(BLI_corr_plot)
else:
    print("BLI_corr_plot not set; skipping save")