In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [4]:
%load_ext autoreload
%autoreload 2
%load_ext blackcellmagic

In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Basic plotting
import holoviews as hv
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter

# Imports for AUC/ROC
from sklearn.metrics import roc_curve, auc

from bokeh.themes.theme import Theme

import panel as pn
pn.config.comms = "vscode"


# Correlation

# Large data plotting
from holoviews.operation.datashader import rasterize

hv.extension('bokeh')

from SSMuLA.vis import JSON_THEME

hv.renderer('bokeh').theme = JSON_THEME


In [2]:
# Plot Hooks
def one_decimal_x(plot,element):
    plot.handles['plot'].xaxis[0].formatter = NumeralTickFormatter(format="0.0")

def one_decimal_y(plot,element):
    plot.handles['plot'].yaxis[0].formatter = NumeralTickFormatter(format="0.0")

def fixmargins(plot,element):
    plot.handles['plot'].min_border_right=30
    plot.handles['plot'].min_border_left=65
    plot.handles['plot'].min_border_top=20
    plot.handles['plot'].min_border_bottom=65
    plot.handles['plot'].outline_line_color='black'
    plot.handles['plot'].outline_line_alpha=1
    plot.handles['plot'].outline_line_width=1
    plot.handles['plot'].toolbar.autohide = True
    
hooks = [one_decimal_x, one_decimal_y, fixmargins]

In [3]:
# Import the imputed TrpB_data
TrpB_imputed_data = pd.read_csv(
     '../../../data/figure_data/4-site_imputed/20230828_KNN_imputed_TrpB.csv', 
    index_col=0
)

TrpB_imputed_data['imputed'] = True

# Import the measured TrpB_data
TrpB_measured_data = pd.read_csv(
    '../../../data/figure_data/4-site_merged_replicates/20230827/four-site_simplified_AA_data.csv',
)

TrpB_measured_data = TrpB_measured_data[TrpB_measured_data['# Stop'] == 0].copy().drop(columns=['# Stop'])

TrpB_measured_data['imputed'] = False

# Combine and sort the data
TrpB_data = pd.concat([TrpB_imputed_data, TrpB_measured_data]).sort_values('AAs').reset_index(drop=True)

# Add a column where no fitness values are below 0
TrpB_data['fitness (min 0)'] = TrpB_data['fitness'].apply(lambda x: max(0, x))

# Merge with zero-shot predictors
TrpB_data = TrpB_data.merge(
    pd.read_csv('../../../data/figure_data/zero-shot_data/Tm9D8s_4site_ZS+fitness.csv')[['Combo', 'Triad Score']]
    .rename(columns={'Combo': 'AAs', 'Triad Score': 'Triad score'})
).merge(
    pd.read_csv('../../../data/figure_data/zero-shot_data/Tm9D8s_Triad_w_AF.csv')[['Combo', 'Triad Score AF']].rename(columns={'Combo': 'AAs', 'Triad Score AF': 'Triad score AF'}),
    on='AAs'
).merge(
    pd.read_csv('../../../data/figure_data/zero-shot_data/ZeroShotPreds_EVmutation.csv')
    .rename(columns={'Combo': 'AAs', 'EvMutation': 'EVmutation'})
)

# Add protein column
TrpB_data['Protein'] = 'TrpB'

# Turn ddG to positive to match GB1 analysis from MLDE
TrpB_data['Triad score'] = TrpB_data['Triad score']*-1
TrpB_data['Triad score AF'] = TrpB_data['Triad score AF']*-1

# Make column ranking by Triad score
# TrpB_data['Triad score rank'] = TrpB_data['Triad score'].rank(ascending=False)
# TrpB_data['EVmutation rank'] = TrpB_data['EVmutation'].rank(ascending=False)

TrpB_data

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,fitness,active,imputed,fitness (min 0),Triad score,Triad score AF,EVmutation,Protein
0,AAAA,A,A,A,A,0.074455,True,False,0.074455,1425.39688,1502.55370,-13.970285,TrpB
1,AAAC,A,A,A,C,0.056314,True,False,0.056314,1421.17781,1497.41323,-16.150133,TrpB
2,AAAD,A,A,A,D,0.014342,False,False,0.014342,1422.67840,1492.81092,-14.703334,TrpB
3,AAAE,A,A,A,E,0.012914,False,False,0.012914,1419.72829,1492.18555,-14.950169,TrpB
4,AAAF,A,A,A,F,0.005161,False,False,0.005161,1425.82286,1490.52111,-16.150133,TrpB
...,...,...,...,...,...,...,...,...,...,...,...,...,...
159989,YYYS,Y,Y,Y,S,0.016578,False,False,0.016578,1427.95970,1496.76790,-23.215630,TrpB
159990,YYYT,Y,Y,Y,T,0.030715,False,False,0.030715,1425.21459,1491.91485,-26.169945,TrpB
159991,YYYV,Y,Y,Y,V,-0.000589,False,False,0.000000,1423.64236,1494.90492,-26.802021,TrpB
159992,YYYW,Y,Y,Y,W,-0.033119,False,False,0.000000,1425.79403,1487.24063,-26.912713,TrpB


In [4]:
# VDGV is parent

# Import the measured GB1 data
GB1_measured_data = pd.read_csv('../../../data/figure_data/GB1_data/GB1_Fitness.csv').rename(columns={'AAString': 'AAs'}).drop(columns=['Mutations'])

GB1_measured_data['imputed'] = False

# Import the imputed GB1 data
GB1_imputed_data = pd.read_excel('../../../data/figure_data/GB1_data/GB1_missing_data.xlsx').rename(columns={'Variants': 'AAs', 'Imputed fitness': 'Fitness'})

GB1_imputed_data['imputed'] = True

# Combine the data and add AA1 -> AA4 columns
GB1_data = pd.concat([GB1_measured_data, GB1_imputed_data], ignore_index=True).sort_values('AAs').reset_index(drop=True)

for i in range(4):
    GB1_data.insert(i+1, f'AA{i+1}', GB1_data['AAs'].apply(lambda x: x[i]))

# Get the Fitness/max column to scale the data the same way as the TrpB data
GB1_data['fitness'] = GB1_data['Fitness'] / GB1_data['Fitness'].max()
GB1_fit_min = 0.01

# Only set as active if they are not imputed and have a fitness above the minimum. This will prevent them from being included as starting points in the path analysis, but they will still appear in the graphs.
GB1_data['active'] = GB1_data.apply(lambda x: (x['Fitness'] > GB1_fit_min) & (x['imputed'] == False), axis=1)

GB1_data.sort_values('Fitness', ascending=False)

# Merge with zero-shot predictors
GB1_data = GB1_data.merge(
    pd.read_csv('../../../data/figure_data/zero-shot_data/GB1_4site_ZS+fitness.csv')[['Combo', 'EvMutation', 'Triad-FixedBb-dG']]
    .rename(columns={'Combo': 'AAs', 'Triad-FixedBb-dG': 'Triad score', 'EvMutation': 'EVmutation'})
)

# Add protein column
GB1_data['Protein'] = 'GB1'

GB1_data

  warn(msg)


Unnamed: 0,AAs,AA1,AA2,AA3,AA4,Fitness,imputed,fitness,active,EVmutation,Triad score,Protein
0,AAAA,A,A,A,A,1.611610,False,0.162574,True,-16.765606,151.58279,GB1
1,AAAG,A,A,A,G,0.105417,False,0.010634,True,-17.930454,149.05033,GB1
2,AAAH,A,A,A,H,0.000000,False,0.000000,False,-21.654984,149.56146,GB1
3,AAAI,A,A,A,I,0.000000,False,0.000000,False,-21.431052,154.16464,GB1
4,AAAL,A,A,A,L,0.095610,False,0.009645,True,-21.654984,152.57359,GB1
...,...,...,...,...,...,...,...,...,...,...,...,...
149356,YYYS,Y,Y,Y,S,0.004421,False,0.000446,False,-26.113303,128.44003,GB1
149357,YYYT,Y,Y,Y,T,0.021200,False,0.002139,True,-26.113303,126.02200,GB1
149358,YYYV,Y,Y,Y,V,0.041952,False,0.004232,True,-18.271280,121.93533,GB1
149359,YYYW,Y,Y,Y,W,0.009136,False,0.000922,False,-26.113303,103.35184,GB1


In [5]:
# Merge
df = pd.concat([TrpB_data, GB1_data]).reset_index(drop=True).set_index(['Protein', 'AAs']).drop(columns='Fitness')

df

Unnamed: 0_level_0,Unnamed: 1_level_0,AA1,AA2,AA3,AA4,fitness,active,imputed,fitness (min 0),Triad score,Triad score AF,EVmutation
Protein,AAs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
TrpB,AAAA,A,A,A,A,0.074455,True,False,0.074455,1425.39688,1502.55370,-13.970285
TrpB,AAAC,A,A,A,C,0.056314,True,False,0.056314,1421.17781,1497.41323,-16.150133
TrpB,AAAD,A,A,A,D,0.014342,False,False,0.014342,1422.67840,1492.81092,-14.703334
TrpB,AAAE,A,A,A,E,0.012914,False,False,0.012914,1419.72829,1492.18555,-14.950169
TrpB,AAAF,A,A,A,F,0.005161,False,False,0.005161,1425.82286,1490.52111,-16.150133
...,...,...,...,...,...,...,...,...,...,...,...,...
GB1,YYYS,Y,Y,Y,S,0.000446,False,False,,128.44003,,-26.113303
GB1,YYYT,Y,Y,Y,T,0.002139,True,False,,126.02200,,-26.113303
GB1,YYYV,Y,Y,Y,V,0.004232,True,False,,121.93533,,-18.271280
GB1,YYYW,Y,Y,Y,W,0.000922,False,False,,103.35184,,-26.113303


### Compute AUC/ROC for each predictor

In [6]:
hv.extension('bokeh')

In [7]:
ROC_plots = []

for protein in ['TrpB', 'GB1']:
    for predictor in ['EVmutation', 'Triad score']:
        y_true = df.loc[protein][df.loc[protein]['imputed'] == False]['active'].values
        y_score = df.loc[protein][df.loc[protein]['imputed'] == False][predictor].values
        pos_label = True

        # ROC curves
        fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=pos_label)
        temp = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr})

        ROC_plots.append(
            hv.Curve(
                temp, 
                kdims = ['False Positive Rate'],
                vdims=['True Positive Rate'],
                label=f'{protein} {predictor}'
            ).opts(
                frame_height=350,
                frame_width=350,
                xlim=(0,1),
                ylim=(0,1),
                hooks=[one_decimal_x, one_decimal_y, fixmargins],
                fontscale=1.3,
                color=hv.Cycle('Category20')
            )
        )

        roc_auc = auc(fpr, tpr)

        print(f'{protein} {predictor} ROC AUC: {roc_auc}')

ROC_plots.append(
    hv.Curve(
        ((0,1), (0,1)), 'False Positive Rate', 'True Positive Rate',
    ).opts(
        color='black', 
        line_dash='dashed', 
    ))

ROC = hv.Overlay(
    ROC_plots
).opts(
    frame_height=350,
    frame_width=350,
    legend_position='bottom_right',
    xlim=(0,1),
    ylim=(0,1),
    hooks=[one_decimal_x, one_decimal_y, fixmargins],
    fontscale=1.3,
    shared_axes=False,
)

ROC

TrpB EVmutation ROC AUC: 0.8172508993619487
TrpB Triad score ROC AUC: 0.7097054219807677
GB1 EVmutation ROC AUC: 0.637453831496426
GB1 Triad score ROC AUC: 0.6980726962355662


In [8]:
def determine_frac_active(df, protein, zs, score):
    passes = df.loc[protein][df.loc[protein][zs]>score]
    try:
        frac_active = sum(passes['active']) / len(passes)
    except ZeroDivisionError:
        frac_active = 0
        
    frac_library = len(passes) / len(df.loc[protein])
    mean_fitness = passes['fitness'].mean()
    max_fitness = passes['fitness'].max()

    return frac_active, frac_library, mean_fitness, max_fitness

(determine_frac_active(df, 'TrpB', 'EVmutation', -20),
determine_frac_active(df, 'GB1', 'EVmutation', -15))

((0.09583133684714902, 0.5739465229946124, 0.02948817585083339, 1.0),
 (0.5294282904902374,
  0.047663044569867634,
  0.03172176867918205,
  0.625550307361524))

In [9]:
def plot_samples(df, protein, zs):
    
    xs = np.linspace(min(df.loc[protein][zs]), max(df.loc[protein][zs]), 100)
    
    samples = np.array([
        determine_frac_active(df, protein, zs, thresh)
        for thresh in xs
    ]).T
    if protein == 'TrpB':
        p = rasterize(
            hv.Scatter(
                df.loc[protein], 
                kdims=zs, 
                vdims='fitness (min 0)'
            )
        ).opts(
            cmap='magma', 
            hooks=[fixmargins,one_decimal_y], 
            ylabel='fitness',
        )

    else:
        p = rasterize(
            hv.Scatter(
                df.loc[protein], 
                kdims=zs, 
                vdims='fitness'
            )
        ).opts(
            cmap='magma', 
            hooks=[fixmargins,one_decimal_y], 
            ylabel='fitness',
        )
    
    p = p * hv.Curve(
        (xs, samples[0]),
        label='fraction active'
    ).opts(color=hv.Cycle('Category10'))*hv.Curve(
        (xs, samples[2]),
        label='mean fitness'
    ).opts(line_dash='dashed', color=hv.Cycle('Category10'))
    
    p = p.opts(
        frame_height=120,
        frame_width=200,
        legend_position='top_left',
        ylabel='fraction or fitness',
        xlabel=f'{zs}',
        #title=protein,
    )
    
    return p.opts(fontscale=1.2)

In [10]:
TrpB_EVmutation = plot_samples(df, 'TrpB', 'EVmutation')
TrpB_Triad = plot_samples(df, 'TrpB', 'Triad score')
GB1_EVmutation = plot_samples(df, 'GB1', 'EVmutation')
GB1_Triad = plot_samples(df, 'GB1', 'Triad score')

In [11]:
TrpB_EVmutation

BokehModel(combine_events=True, render_bundle={'docs_json': {'5a7af116-7d04-4916-898c-7db5e2bbb2fe': {'defs': …

In [12]:
TrpB_Triad

BokehModel(combine_events=True, render_bundle={'docs_json': {'615183b9-62cb-4b79-8020-5866bbb14f68': {'defs': …

In [13]:
GB1_EVmutation

BokehModel(combine_events=True, render_bundle={'docs_json': {'1ff6b98f-128f-486a-830e-2a3a1d55a8e6': {'defs': …

In [14]:
GB1_Triad

BokehModel(combine_events=True, render_bundle={'docs_json': {'1397ebb1-df34-4fe4-88d5-052b1caca07a': {'defs': …

In [15]:
folder = '../../../data/output_figures/'

# ROC plot
plot = hv.render(ROC, backend='bokeh')
plot.output_backend = "svg"

filename=f'{folder}figure4a_ROC.svg'
export_svg(plot, filename=filename)

# Zero-shot predictor plots
plot=hv.render(TrpB_EVmutation, backend='bokeh')
plot.output_backend = "svg"

filename=f'{folder}figure4b_TrpB_EVmutation.svg'
export_svg(plot, filename=filename)

plot=hv.render(TrpB_Triad, backend='bokeh')
plot.output_backend = "svg"

filename=f'{folder}figure4b_TrpB_Triad.svg'
export_svg(plot, filename=filename)

plot=hv.render(GB1_EVmutation, backend='bokeh')
plot.output_backend = "svg"

filename=f'{folder}figure4c_GB1_EVmutation.svg'
export_svg(plot, filename=filename)

plot=hv.render(GB1_Triad, backend='bokeh')
plot.output_backend = "svg"

filename=f'{folder}figure4c_GB1_Triad.svg'
export_svg(plot, filename=filename)

['../../../data/output_figures/figure4c_GB1_Triad.svg']

In [16]:
import os
os.system('jupyter nbconvert --to html predictor_analysis.ipynb')

[NbConvertApp] Converting notebook predictor_analysis.ipynb to html
  {%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] Writing 2664211 bytes to predictor_analysis.html


0