In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [7]:
%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### TODO: Add an analysis of how epistasis depends on which positions are involved

In [3]:
# General imports
import glob
import os
import re
import pickle
import datetime

# Data manipulation
import pandas as pd
import numpy as np

# Basic plotting
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter
from bokeh.io import output_notebook
output_notebook()


import panel as pn
pn.config.comms = "vscode"

# Large data plotting
import datashader as ds
from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
from tqdm.auto import tqdm
from multiprocessing import Pool
from operator import itemgetter

import holoviews as hv
hv.extension('bokeh')

from SSMuLA.vis import JSON_THEME

from bokeh.themes.theme import Theme

hv.renderer('bokeh').theme = JSON_THEME

In [4]:
from SSMuLA.aa_global import ALL_AAS

In [13]:
def make_new_sequence(input_seq, new_AA, position):
    """Return ``input_seq`` with the character at ``position`` swapped for ``new_AA``.

    Parameters
    ----------
    input_seq : str
        The sequence to modify (it is not changed in place).
    new_AA : str
        The replacement residue.
    position : int
        Index of the residue to replace; normal list indexing rules apply,
        so an out-of-range index raises ``IndexError``.

    Returns
    -------
    str
        The new sequence.
    """
    residues = [*input_seq]
    residues[position] = new_AA
    return ''.join(residues)

def hamming(str1, str2):
    """Count the positions at which two equal-length sequences differ.

    An ``AssertionError`` is raised if the two inputs have different lengths.
    """
    assert len(str1) == len(str2)

    return sum(1 for left, right in zip(str1, str2) if left != right)

## Plot hooks

In [14]:
# Plot Hooks
def one_decimal_x(plot,element):
    """Plot hook: format the tick labels of the first x axis with the "0.0" numeral format."""
    x_axis = plot.handles['plot'].xaxis[0]
    x_axis.formatter = NumeralTickFormatter(format="0.0")

def one_decimal_y(plot,element):
    """Plot hook: format the tick labels of the first y axis with the "0.0" numeral format."""
    y_axis = plot.handles['plot'].yaxis[0]
    y_axis.formatter = NumeralTickFormatter(format="0.0")

def fixmargins(plot,element):
    """Plot hook: set fixed minimum borders, a solid black outline and an auto-hiding toolbar.

    ``element`` is unused; it is part of the hook signature.
    """
    figure = plot.handles['plot']

    # Applied in order: minimum borders first, then the frame outline.
    figure_settings = (
        ('min_border_right', 30),
        ('min_border_left', 65),
        ('min_border_top', 20),
        ('min_border_bottom', 65),
        ('outline_line_color', 'black'),
        ('outline_line_alpha', 1),
        ('outline_line_width', 1),
    )
    for attribute, value in figure_settings:
        setattr(figure, attribute, value)

    figure.toolbar.autohide = True

In [5]:
# Import the imputed TrpB_data
# NOTE(review): these paths are relative to the kernel's working directory,
# which the first cell sets with `%cd` -- confirm they resolve from there.
TrpB_imputed_data = pd.read_csv(
     '../../../data/figure_data/4-site_imputed/20230828_KNN_imputed_TrpB.csv', 
    index_col=0
)

# Flag every row from the imputed file so it can be told apart after merging
TrpB_imputed_data['imputed'] = True

# Import the measured TrpB_data
TrpB_measured_data = pd.read_csv(
    '../../../data/figure_data/4-site_merged_replicates/20230827/four-site_simplified_AA_data.csv',
)

# Keep only rows whose '# Stop' value is 0 (presumably the stop-codon count --
# verify against the data file), then drop that column.
TrpB_measured_data = TrpB_measured_data[TrpB_measured_data['# Stop'] == 0].copy().drop(columns=['# Stop'])

TrpB_measured_data['imputed'] = False

# Combine and sort the data
TrpB_data = pd.concat([TrpB_imputed_data, TrpB_measured_data]).sort_values('AAs').reset_index(drop=True)

# Add a column where no fitness values are below 0
TrpB_data['fitness (min 0)'] = TrpB_data['fitness'].apply(lambda x: max(0, x))

# Lowest fitness among the rows flagged as active; used later as the TrpB
# fitness threshold when filtering the epistasis table.
TrpB_fit_min = TrpB_data[TrpB_data['active']]['fitness'].min()

TrpB_data

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,fitness,active,imputed,fitness (min 0)
0,AAAA,A,A,A,A,0.074455,True,False,0.074455
1,AAAC,A,A,A,C,0.056314,True,False,0.056314
2,AAAD,A,A,A,D,0.014342,False,False,0.014342
3,AAAE,A,A,A,E,0.012914,False,False,0.012914
4,AAAF,A,A,A,F,0.005161,False,False,0.005161
...,...,...,...,...,...,...,...,...,...
159995,YYYS,Y,Y,Y,S,0.016578,False,False,0.016578
159996,YYYT,Y,Y,Y,T,0.030715,False,False,0.030715
159997,YYYV,Y,Y,Y,V,-0.000589,False,False,0.000000
159998,YYYW,Y,Y,Y,W,-0.033119,False,False,0.000000


In [6]:
# VDGV is parent

# Import the measured GB1 data
GB1_measured_data = (
    pd.read_csv('../../../data/figure_data/GB1_data/GB1_Fitness.csv')
    .rename(columns={'AAString': 'AAs'})
    .drop(columns=['Mutations'])
)

GB1_measured_data['imputed'] = False

# Import the imputed GB1 data
GB1_imputed_data = (
    pd.read_excel('../../../data/figure_data/GB1_data/GB1_missing_data.xlsx')
    .rename(columns={'Variants': 'AAs', 'Imputed fitness': 'Fitness'})
)

GB1_imputed_data['imputed'] = True

# Combine the data and add AA1 -> AA4 columns
GB1_data = pd.concat([GB1_measured_data, GB1_imputed_data], ignore_index=True).sort_values('AAs').reset_index(drop=True)

for i in range(4):
    GB1_data.insert(i+1, f'AA{i+1}', GB1_data['AAs'].str[i])

# Get the Fitness/max column to scale the data the same way as the TrpB data
GB1_data['Fitness/max'] = GB1_data['Fitness'] / GB1_data['Fitness'].max()
GB1_fit_min = 0.01 / GB1_data['Fitness'].max()

# Fitness of the 9783rd-best measured (non-imputed) variant.
# Two things matter here:
#   * GB1_data is sorted alphabetically by 'AAs', so the value has to be picked
#     by fitness rank, not by row position.
#   * The threshold is later compared against the fit_* columns of the GB1
#     epistasis table, which are built from 'Fitness/max', so it must be on
#     that same normalised scale.
GB1_is_measured = ~GB1_data['imputed'].astype(bool)
GB1_min_top9783 = GB1_data.loc[GB1_is_measured, 'Fitness/max'].nlargest(9783).iloc[-1]

# Only set as active if they are not imputed and have a fitness above the minimum. This will prevent them from being included as starting points in the path analysis, but they will still appear in the graphs.
GB1_data['active'] = (GB1_data['Fitness/max'] > GB1_fit_min) & GB1_is_measured

GB1_data

  warn(msg)


Unnamed: 0,AAs,AA1,AA2,AA3,AA4,Fitness,imputed,Fitness/max,active
0,AAAA,A,A,A,A,1.611610,False,0.162574,True
1,AAAC,A,A,A,C,0.049726,True,0.005016,False
2,AAAD,A,A,A,D,0.011857,True,0.001196,False
3,AAAE,A,A,A,E,0.011416,True,0.001152,False
4,AAAF,A,A,A,F,0.029688,True,0.002995,False
...,...,...,...,...,...,...,...,...,...
159995,YYYS,Y,Y,Y,S,0.004421,False,0.000446,False
159996,YYYT,Y,Y,Y,T,0.021200,False,0.002139,True
159997,YYYV,Y,Y,Y,V,0.041952,False,0.004232,True
159998,YYYW,Y,Y,Y,W,0.009136,False,0.000922,False


In [7]:
# Lookup table: fitness rank order of the four genotypes -> epistasis class.
#
# Genotype labels (as built in pairwise_epistasis):
#   '00' = starting sequence (ab)
#   '10' = mutated at the first position of the pair only (Ab)
#   '01' = mutated at the second position of the pair only (aB)
#   '11' = mutated at both positions (AB)
#
# Each key lists the four labels from LOWEST to HIGHEST fitness. All 24 possible
# orderings are present, so a lookup on any sorted 4-tuple succeeds.
assign_epistasis_dict = {
   ### starting variant #1 ###
   ('00', '01', '10', '11'): 'magnitude', # upwards
   ('00', '10', '01', '11'): 'magnitude', # upwards

   ('00', '01', '11', '10'): 'sign', # upwards
   ('00', '10', '11', '01'): 'sign', # upwards

   ('00', '11', '01', '10'): 'reciprocal sign', # upwards 
   ('00', '11', '10', '01'): 'reciprocal sign', # upwards
  
   ### starting variant #2 ###
   ('01', '00', '11', '10'): 'magnitude', # upwards
   ('10', '00', '11', '01'): 'magnitude', # upwards

   ('01', '00', '10', '11'): 'sign', # upwards
   ('10', '00', '01', '11'): 'sign', # upwards

   ('11', '00', '01', '10'): 'reciprocal sign', # downwards
   ('11', '00', '10', '01'): 'reciprocal sign', # downwards

   ### starting variant #3 ###
   ('01', '11', '00', '10'): 'magnitude', # downwards
   ('10', '11', '00', '01'): 'magnitude', # downwards

   ('11', '01', '00', '10'): 'sign', # downwards
   ('11', '10', '00', '01'): 'sign', # downwards

   ('01', '10', '00', '11'): 'reciprocal sign', # upwards
   ('10', '01', '00', '11'): 'reciprocal sign', # upwards

   ### starting variant #4 ###
   ('11', '01', '10', '00'): 'magnitude', # downwards
   ('11', '10', '01', '00'): 'magnitude', # downwards

   ('01', '11', '10', '00'): 'sign', # downwards
   ('10', '11', '01', '00'): 'sign', # downwards
    
   ('01', '10', '11', '00'): 'reciprocal sign', #downwards
   ('10', '01', '11', '00'): 'reciprocal sign', # downwards     
}

def pairwise_epistasis(seq_ab, data_dict):
    """
    Classifies the epistasis type between the given sequence and all of its double mutants.

    For every pair of positions in ``seq_ab`` and every pair of substitutions
    (one per position, each different from the residue in ``seq_ab``), the four
    genotypes ab / Ab / aB / AB are ranked by fitness and the rank order is
    looked up in ``assign_epistasis_dict``.

    Parameters
    ----------
    seq_ab : str
        The starting sequence to compare to all other double mutants.
    data_dict : dict
        A dictionary of all sequences and their fitness values. It must contain
        ``seq_ab`` and every single and double mutant of it; a missing
        sequence raises ``KeyError``.
    
    Returns
    -------
    epistasis_results : pd.DataFrame
        One row per (position pair, substitution 1, substitution 2). The columns
        are not yet given descriptive names here (``get_epistasis_data`` renames them):
            - start_seq: the starting sequence
            - level_0: the two mutated positions, as the string f'{pos1}{pos2}'
            - level_1: the amino acid introduced at the first position
            - level_2: the amino acid introduced at the second position
            - 0: fit_ab, the fitness of the starting sequence
            - 1: fit_Ab, the fitness with only the first position mutated
            - 2: fit_aB, the fitness with only the second position mutated
            - 3: fit_AB, the fitness with both positions mutated
            - 4: the epistasis type ('magnitude', 'sign' or 'reciprocal sign')

    Notes
    -----
    The number of positions is taken from ``len(seq_ab)``. The position label
    simply concatenates the two indices, so it is only unambiguous for
    sequences with at most ten positions.
    Ties in fitness are ordered by Python's stable sort, i.e. in the fixed
    order '00', '10', '01', '11'.
    """
    
    position_orders = list(itertools.combinations(range(len(seq_ab)), 2))
    epsilon_dict = {}
    fit_ab = data_dict[seq_ab]

    for pos1, pos2 in position_orders:

        # The single mutants at the second position do not depend on AA1,
        # so look them up once per position pair instead of once per AA1.
        second_site_mutants = [
            (AA2, data_dict[make_new_sequence(seq_ab, AA2, pos2)])
            for AA2 in ALL_AAS
            if AA2 != seq_ab[pos2]
        ]

        for AA1 in ALL_AAS:

            # Skip the residue that is already present at this position
            if AA1 == seq_ab[pos1]:
                continue

            seq_Ab = make_new_sequence(seq_ab, AA1, pos1)
            fit_Ab = data_dict[seq_Ab]

            for AA2, fit_aB in second_site_mutants:
                seq_AB = make_new_sequence(seq_Ab, AA2, pos2)
                fit_AB = data_dict[seq_AB]

                # sort the entries by fitness (lowest first)
                sorted_fitness_list = sorted(
                    [('00', fit_ab), ('10', fit_Ab), ('01', fit_aB), ('11', fit_AB)],
                    key=itemgetter(1),
                )

                # keep only the genotype labels, in rank order
                sorted_entries = tuple(label for label, _ in sorted_fitness_list)

                epistasis_type = assign_epistasis_dict[sorted_entries]

                epsilon_dict[(f'{pos1}{pos2}', AA1, AA2)] = [fit_ab, fit_Ab, fit_aB, fit_AB, epistasis_type]
    
    # Tuple keys become a 3-level index; reset_index turns them into level_0..level_2 columns
    epistasis_results = pd.DataFrame(epsilon_dict).T.reset_index()
    epistasis_results.insert(0, 'start_seq', seq_ab)

    return epistasis_results

def get_epistasis_data(full_df, seq_col, fitness_col, n_jobs=1):
    """
    Runs ``pairwise_epistasis`` for every active variant and combines the results.

    Parameters
    ----------
    full_df : pd.DataFrame
        Table of all variants. Must contain ``seq_col``, ``fitness_col`` and a
        boolean 'active' column; only rows with ``active == True`` are used as
        starting sequences, but all rows are available for fitness lookup.
    seq_col : str
        Name of the column holding the sequences.
    fitness_col : str
        Name of the column holding the fitness values.
    n_jobs : int
        Number of worker processes passed to ``multiprocessing.Pool``.

    Returns
    -------
    all_epistasis_results : pd.DataFrame
        Indexed by (start_seq, positions, res1_AA, res2_AA) with float64 columns
        fit_ab, fit_Ab, fit_aB, fit_AB and the column epistasis_type.
    """
    from functools import partial

    # Get all nonzero sequences
    active_variants = full_df.loc[full_df['active'], seq_col].tolist()

    # Convert the data to a dictionary for faster lookup
    data_dict = dict(zip(full_df[seq_col].values, full_df[fitness_col].values))
    worker = partial(pairwise_epistasis, data_dict=data_dict)

    # imap sends one task per item unless a chunksize is given; batching keeps
    # the number of times the (large) lookup dict is shipped to workers small.
    chunksize = max(1, len(active_variants) // ((n_jobs or 1) * 16))

    with Pool(n_jobs) as pool:
        # Wrap the *results* iterator so the bar advances as work completes.
        # (Wrapping the inputs of starmap does not work: starmap consumes its
        # whole input up front, before any task has finished.)
        result = list(
            tqdm(
                pool.imap(worker, active_variants, chunksize=chunksize),
                total=len(active_variants),
            )
        )

    all_epistasis_results = pd.concat(result).rename(
        columns={
            0:'fit_ab', 
            1:'fit_Ab', 
            2:'fit_aB', 
            3:'fit_AB', 
            4:'epistasis_type',
            'level_0': 'positions',
            'level_1': 'res1_AA',
            'level_2': 'res2_AA'
        }
    ).set_index(['start_seq', 'positions','res1_AA','res2_AA'])

    # Set column types (the per-variant frames hold mixed float/str data as object dtype)
    all_epistasis_results = all_epistasis_results.astype({
        'fit_ab': 'float64',
        'fit_Ab': 'float64',
        'fit_aB': 'float64',
        'fit_AB': 'float64',
    })

    return all_epistasis_results

# In the parent background, how is the epistasis distributed between the position pairs?

In [8]:
TrpB_epistasis_df = get_epistasis_data(
    TrpB_data, 
    'AAs', 
    'fitness', 
    n_jobs=12
)
TrpB_epistasis_df

  0%|          | 0/9783 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fit_ab,fit_Ab,fit_aB,fit_AB,epistasis_type
start_seq,positions,res1_AA,res2_AA,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAAA,01,C,C,0.074455,0.062987,0.041499,0.015102,magnitude
AAAA,01,C,D,0.074455,0.062987,0.012397,-0.000155,magnitude
AAAA,01,C,E,0.074455,0.062987,0.023336,-0.012486,magnitude
AAAA,01,C,F,0.074455,0.062987,0.148995,0.084898,magnitude
AAAA,01,C,G,0.074455,0.062987,0.071163,0.050405,magnitude
...,...,...,...,...,...,...,...,...
YYYG,23,W,S,0.086705,0.038344,0.016578,0.005757,magnitude
YYYG,23,W,T,0.086705,0.038344,0.030715,0.008323,magnitude
YYYG,23,W,V,0.086705,0.038344,-0.000589,0.003343,sign
YYYG,23,W,W,0.086705,0.038344,-0.033119,-0.012797,sign


In [9]:
GB1_epistasis_df = get_epistasis_data(
    GB1_data, 
    'AAs', 
    'Fitness/max',
    n_jobs=12
)

  0%|          | 0/34545 [00:00<?, ?it/s]

In [10]:
# Per-protein mapping from a position label ('0'-'3', as a string) to another
# string label -- presumably the residue number of that site in the full
# protein; verify against the library design.
# NOTE(review): this mapping is not referenced by any other cell in this view.
position_mapper = {
    'TrpB': {
        '0': '183',
        '1': '184',
        '2': '227',
        '3': '228'
    },
    'GB1': {
        '0': '39',
        '1': '40',
        '2': '41',
        '3': '54'
    }
} 

In [11]:
def filter_data(df, fit_min=0):
    """Return the rows of ``df`` in which all four fitness values are at least ``fit_min``.

    The columns checked are 'fit_ab', 'fit_Ab', 'fit_aB' and 'fit_AB'.
    """
    fitness_columns = ['fit_ab', 'fit_Ab', 'fit_aB', 'fit_AB']
    meets_threshold = (df[fitness_columns] >= fit_min).all(axis=1)

    return df[meets_threshold]

def calc_epsilon(row):
    """Compute epsilon = ln(AB/ab) - ln(Ab/ab) - ln(aB/ab) from the four fitness values.

    ``row`` is indexed by 'fit_ab', 'fit_Ab', 'fit_aB' and 'fit_AB'.
    """
    reference = row['fit_ab']

    double_mutant_term = np.log(row['fit_AB'] / reference)
    first_single_term = np.log(row['fit_Ab'] / reference)
    second_single_term = np.log(row['fit_aB'] / reference)

    return double_mutant_term - first_single_term - second_single_term

print('Filtering data...')
# Filter the data by fitness: keep only rows where all four variants are at
# or above the protein's minimum fitness
filtered_TrpB_epistasis_df = filter_data(TrpB_epistasis_df, TrpB_fit_min).copy()
filtered_GB1_epistasis_df = filter_data(GB1_epistasis_df, GB1_fit_min).copy()

# Add a column with epsilon.
# calc_epsilon only uses column lookups, division and np.log, so it can be
# applied to the whole frame at once; this gives the same values as a
# row-by-row DataFrame.apply(axis=1) without the per-row Python overhead.
print('Calculating epsilon for TrpB...')
filtered_TrpB_epistasis_df['epsilon'] = calc_epsilon(filtered_TrpB_epistasis_df)

print('Calculating epsilon for GB1...')
filtered_GB1_epistasis_df['epsilon'] = calc_epsilon(filtered_GB1_epistasis_df)

# Make a GB1_epistasis_df with the top 9783 variants
print('Filtering top 9783 variants for GB1...')
top9783_GB1_epistasis_df = filter_data(filtered_GB1_epistasis_df, GB1_min_top9783).copy()

Filtering data...
Calculating epsilon for TrpB...
Calculating epsilon for GB1...
Filtering top 9783 variants for GB1...


In [12]:
# Build one long table holding both proteins under both filters, indexed by
# (filter, protein, start_seq, positions, res1_AA, res2_AA).
temp_TrpB_1 = filtered_TrpB_epistasis_df.reset_index()
temp_TrpB_1['protein'] = 'TrpB'

# The TrpB rows are duplicated on purpose: the same fit_min-filtered TrpB data
# is stored under both filter labels, so TrpB 'top9783' is identical to TrpB 'fit_min'.
temp_TrpB_2 = temp_TrpB_1.copy()

temp_TrpB_1['filter'] = 'fit_min'
temp_TrpB_2['filter'] = 'top9783'

# For GB1 the two filters are genuinely different tables
temp_GB1_1 = filtered_GB1_epistasis_df.reset_index()
temp_GB1_2 = top9783_GB1_epistasis_df.reset_index()

temp_GB1_1['protein'] = 'GB1'
temp_GB1_2['protein'] = 'GB1'

temp_GB1_1['filter'] = 'fit_min'
temp_GB1_2['filter'] = 'top9783'

combined_epistasis_df = pd.concat([temp_TrpB_1, temp_TrpB_2, temp_GB1_1, temp_GB1_2]).set_index(['filter', 'protein', 'start_seq', 'positions','res1_AA','res2_AA']).sort_index()
combined_epistasis_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,fit_ab,fit_Ab,fit_aB,fit_AB,epistasis_type,epsilon
filter,protein,start_seq,positions,res1_AA,res2_AA,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fit_min,GB1,AAAA,01,C,C,0.162574,0.338931,0.132318,0.074346,sign,-1.311150
fit_min,GB1,AAAA,01,C,D,0.162574,0.338931,0.195667,0.224941,sign,-0.595240
fit_min,GB1,AAAA,01,C,E,0.162574,0.338931,0.044238,0.104625,magnitude,0.126138
fit_min,GB1,AAAA,01,C,F,0.162574,0.338931,0.014666,0.133364,magnitude,1.472900
fit_min,GB1,AAAA,01,C,G,0.162574,0.338931,0.002910,0.096729,magnitude,2.768961
...,...,...,...,...,...,...,...,...,...,...,...
top9783,TrpB,YYKG,01,V,N,0.067835,0.091508,0.079191,0.439573,magnitude,1.414598
top9783,TrpB,YYKG,01,V,R,0.067835,0.091508,0.056512,0.054787,sign,-0.330349
top9783,TrpB,YYKG,01,V,T,0.067835,0.091508,0.056549,0.290510,sign,1.337185
top9783,TrpB,YYKG,02,E,Y,0.067835,0.069020,0.086705,0.054073,reciprocal sign,-0.489491


### Looking into the epsilon values by position pair
Top: The distribution of epsilon values for each pair where all values are greater than the fit min. 

Bottom: The distribution of epsilon values for the pairs in which all four variants are among the top 9783 non-imputed variants by fitness.

In [31]:
hv.extension('bokeh')

In [32]:
def _epsilon_violin(filter_name, fontscale):
    """Split violin of epsilon per position pair (TrpB vs. GB1) for one filter level."""
    violin = hv.Violin(
        combined_epistasis_df.loc[filter_name],
        kdims=['positions', 'protein'],
        vdims=['epsilon'],
    )
    return violin.opts(
        split='protein',
        frame_height=300,
        frame_width=300,
        fontscale=fontscale,
        show_legend=True,
        inner=None,
        violin_width=0.8,
        legend_position='top',
        hooks=[fixmargins],
    )

# The two panels differ only in the filter level and the font scale
_epsilon_violin('fit_min', fontscale=1.35) + _epsilon_violin('top9783', fontscale=1.5)

In [33]:
def hook(plot,element):
    """Plot hook: fix the order of the (position pair, epistasis type) categories on the x axis."""
    position_pairs = ['01','02', '03', '12', '13', '23']
    epistasis_types = ['magnitude', 'sign', 'reciprocal sign']
    plot.handles['plot'].x_range.factors = list(itertools.product(position_pairs, epistasis_types))

# Options shared by both panels; only the data slice and the title differ
epsilon_by_type_opts = dict(
    frame_height=350,
    frame_width=600,
    xrotation=90,
    violin_width=.8,
    fontscale=1.25,
    violin_color='epistasis_type',
    hooks=[fixmargins, one_decimal_y, hook],
)

TrpB_plot, GB1_plot = (
    hv.Violin(
        combined_epistasis_df.loc['fit_min', protein_name],
        kdims=['positions', 'epistasis_type'],
        vdims=['epsilon']
    ).opts(title=protein_name, **epsilon_by_type_opts)
    for protein_name in ('TrpB', 'GB1')
)
TrpB_plot + GB1_plot

In [34]:
# GB1 only: epsilon per (position pair, epistasis type), with each violin split
# by filter level ('fit_min' vs 'top9783').
# NOTE: `hook` is not defined in this cell; it uses whichever `hook` was defined
# most recently in the kernel. This cell expects the version that sets
# (position pair, epistasis type) factors.
hv.Violin(
    # every filter level, GB1 only, all start_seq / positions / residues
    combined_epistasis_df.loc[slice(None), 'GB1', slice(None), slice(None), slice(None), slice(None)],
    kdims=['positions', 'epistasis_type', 'filter'],
    vdims=['epsilon']
).opts(
    frame_height=350,
    frame_width=600,
    xrotation=90,
    violin_width=.9,
    inner=None,
    fontscale=1.25,
    split='filter',
    hooks=[fixmargins, one_decimal_y, hook],
    title='GB1',
    show_legend=True,
)

In [16]:
# Count, for every (filter, protein, position pair, starting sequence), how many
# double mutants fall into each epistasis type, and convert to fractions.
temp = combined_epistasis_df.copy()
# As a categorical, epistasis_type makes the groupby below emit rows for
# combinations that never occur (count 0), as the displayed output shows.
temp['epistasis_type'] = temp['epistasis_type'].astype('category')

grouped_epistasis_df = pd.DataFrame(temp.groupby(['filter', 'protein', 'positions', 'start_seq', 'epistasis_type']).size()).rename(columns={0:'count'})

# total = number of classified double mutants for that starting sequence and position pair
grouped_epistasis_df['total'] = grouped_epistasis_df.groupby(['filter','protein', 'positions', 'start_seq'])['count'].transform('sum')
# 0 / 0 gives NaN for the never-observed combinations; later cells drop them with total > 0
grouped_epistasis_df['frac epistasis type'] = grouped_epistasis_df['count'] / grouped_epistasis_df['total']

grouped_epistasis_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,total,frac epistasis type
filter,protein,positions,start_seq,epistasis_type,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fit_min,GB1,01,AAAA,magnitude,194,299,0.648829
fit_min,GB1,01,AAAA,reciprocal sign,3,299,0.010033
fit_min,GB1,01,AAAA,sign,102,299,0.341137
fit_min,GB1,01,AAAC,magnitude,0,0,
fit_min,GB1,01,AAAC,reciprocal sign,0,0,
...,...,...,...,...,...,...,...
top9783,TrpB,23,YYYT,reciprocal sign,0,0,
top9783,TrpB,23,YYYT,sign,0,0,
top9783,TrpB,23,YYYV,magnitude,0,0,
top9783,TrpB,23,YYYV,reciprocal sign,0,0,


In [35]:
# Drop the never-observed combinations, then keep the 'fit_min' filter level
has_observations = grouped_epistasis_df['total'] > 0
temp = grouped_epistasis_df[has_observations].copy().loc['fit_min']

def hook(plot,element):
    """Plot hook: fix the order of the (position pair, epistasis type) categories on the x axis."""
    position_pairs = ['01','02', '03', '12', '13', '23']
    epistasis_types = ['magnitude', 'sign', 'reciprocal sign']
    plot.handles['plot'].x_range.factors = list(itertools.product(position_pairs, epistasis_types))

# Both panels use identical options apart from the title
positional_opts = dict(
    frame_height=350,
    frame_width=600,
    xrotation=90,
    violin_width=.8,
    violin_color='epistasis_type',
    hooks=[fixmargins, one_decimal_y, hook],
    fontscale=1.25,
    show_legend=True,
    legend_position='top'
)

positional_epistasis_TrpB, positional_epistasis_GB1 = (
    hv.Violin(
        temp.loc[protein_name],
        kdims=['positions', 'epistasis_type'],
        vdims=['frac epistasis type']
    ).opts(title=protein_name, **positional_opts)
    for protein_name in ('TrpB', 'GB1')
)

positional_epistasis_TrpB + positional_epistasis_GB1

## Same plot but enforce fit_AB > fit_ab

In [36]:
# Same fraction-per-epistasis-type analysis, restricted to double mutants that
# are fitter than the starting sequence (fit_AB > fit_ab).
temp = combined_epistasis_df[combined_epistasis_df['fit_AB'] > combined_epistasis_df['fit_ab']].copy()
temp['epistasis_type'] = temp['epistasis_type'].astype('category')

# `temp` is reassigned several times below; each step consumes the previous one
temp = pd.DataFrame(temp.groupby(['filter', 'protein', 'positions', 'start_seq', 'epistasis_type']).size()).rename(columns={0:'count'})

temp['total'] = temp.groupby(['filter','protein', 'positions', 'start_seq'])['count'].transform('sum')
temp['frac epistasis type'] = temp['count'] / temp['total']

# Drop groups with no observations, then keep the 'fit_min' filter level
temp = temp[temp['total'] > 0].copy().loc['fit_min']

def hook(plot,element):
    # Fix the order of the (position pair, epistasis type) categories on the x axis
    plot.handles['plot'].x_range.factors = [(position, epistasis) for position in ['01','02', '03', '12', '13', '23'] for epistasis in ['magnitude', 'sign', 'reciprocal sign']]

hv.Violin(
    temp.loc['TrpB'],
    kdims=['positions', 'epistasis_type'],
    vdims=['frac epistasis type']
).opts(
    frame_height=350,
    frame_width=600,
    xrotation=90,
    violin_width=.8,
    violin_color='epistasis_type',
    hooks=[fixmargins, one_decimal_y, hook],
    title='TrpB',
    fontscale=1.5
)+\
hv.Violin(
    temp.loc['GB1'],
    kdims=['positions', 'epistasis_type'],
    vdims=['frac epistasis type']
).opts(
    frame_height=350,
    frame_width=600,
    xrotation=90,
    violin_width=.8,
    violin_color='epistasis_type',
    fontscale=1.5,
    hooks=[fixmargins, one_decimal_y, hook],
    title='GB1',
)

In [37]:
def hook(plot,element):
    """Plot hook: fix the order of the (position pair, epistasis type) categories on the x axis."""
    plot.handles['plot'].x_range.factors = [(position, epistasis) for position in ['01','02', '03', '12', '13', '23'] for epistasis in ['magnitude', 'sign', 'reciprocal sign']]

# Everything below that does not depend on the variant is computed once, before
# the loop, instead of being rebuilt identically for every variant.

# TrpB rows at the 'fit_min' filter level, indexed by (start_seq, positions, res1_AA, res2_AA)
trpb_fit_min_df = combined_epistasis_df.loc['fit_min', 'TrpB', slice(None), slice(None), slice(None), slice(None)].copy()

# Fraction of each epistasis type per (start_seq, position pair)
trpb_type_counts = trpb_fit_min_df.copy()
trpb_type_counts['epistasis_type'] = trpb_type_counts['epistasis_type'].astype('category')
trpb_type_fractions = pd.DataFrame(
    trpb_type_counts.groupby(['start_seq', 'positions',  'epistasis_type']).size()
).rename(columns={0:'count'})

trpb_type_fractions['total'] = trpb_type_fractions.groupby(['start_seq', 'positions'])['count'].transform('sum')
trpb_type_fractions['frac epistasis type'] = trpb_type_fractions['count'] / trpb_type_fractions['total']

# Drop (start_seq, position pair) groups with no observations
trpb_type_fractions = trpb_type_fractions[trpb_type_fractions['total'] > 0].copy()

plots = []

for variant in ['VFVS','VIVS','VIVG','VIKG','AIKG']:

    # Left panel: distribution of epsilon for this starting variant
    plots.append(
        hv.Violin(
            trpb_fit_min_df.loc[variant],
            kdims=['positions', 'epistasis_type'],
            vdims=['epsilon']
        ).opts(
            frame_height=250,
            frame_width=400,
            xrotation=90,
            violin_width=.8,
            violin_color='epistasis_type',
            hooks=[fixmargins, one_decimal_y, hook],
            title=variant,
        )
    )

    # Right panel: fraction of each epistasis type for this starting variant
    plots.append(
        hv.Bars(
            trpb_type_fractions.loc[variant],
            kdims=['positions', 'epistasis_type'],
            vdims=['frac epistasis type']
        ).opts(
            frame_height=200,
            frame_width=350,
            xrotation=90,
            hooks=[fixmargins, one_decimal_y, hook],
            title=variant,
        )
    )

hv.Layout(plots).cols(2)

# Get DataFrame with quartiles

In [20]:
def add_quartiles(df):
    """
    Returns a copy of ``df`` with a 'quartile' column ('Q1'-'Q4') based on fit_ab.

    The quartile cutoffs are computed over the unique (start_seq, fit_ab)
    pairs, so each starting sequence counts once regardless of how many rows
    it has. A row is assigned to the first quartile whose upper cutoff is
    greater than or equal to its fit_ab value.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide 'start_seq' (as a column or an index level) and a numeric
        'fit_ab' column. The input is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the extra 'quartile' column. Rows whose fit_ab is
        NaN (or above the largest cutoff) get ``None``.

    Notes
    -----
    Labels are assigned by position in the sorted cutoff array rather than
    through a dict keyed by the cutoff values, so tied cutoffs cannot
    overwrite each other: a value at or below the 25% cutoff is always 'Q1'.
    """

    df = df.copy()

    # One row per starting sequence
    unique_starts = df.reset_index()[['start_seq','fit_ab']].drop_duplicates()

    # Upper bounds of Q1..Q4 (ascending by construction)
    cutoffs = unique_starts['fit_ab'].quantile([0.25, 0.5, 0.75, 1]).to_numpy()
    labels = np.array(['Q1', 'Q2', 'Q3', 'Q4', None], dtype=object)

    values = df['fit_ab'].to_numpy(dtype=float)

    # side='left' gives the first cutoff with value <= cutoff
    bin_index = np.searchsorted(cutoffs, values, side='left')
    # NaN never satisfies value <= cutoff, so it must not receive a label
    bin_index[np.isnan(values)] = len(cutoffs)

    df['quartile'] = labels[bin_index]

    return df

def add_quartiles_and_combine(TrpB_df, GB1_df):
    """Add per-protein quartile labels to both tables and stack them into one frame.

    Quartiles are computed separately for each protein (via ``add_quartiles``).
    The result is indexed by
    (protein, start_seq, positions, res1_AA, res2_AA, quartile) and sorted.
    The inputs are not modified.
    """
    labelled_frames = []

    for protein_name, protein_df in (('TrpB', TrpB_df), ('GB1', GB1_df)):
        # Work on copies so the caller's frames stay untouched
        with_quartiles = add_quartiles(protein_df.copy()).copy()
        with_quartiles['protein'] = protein_name
        labelled_frames.append(with_quartiles)

    index_levels = ['protein', 'start_seq', 'positions', 'res1_AA', 'res2_AA', 'quartile']

    stacked = pd.concat(labelled_frames, axis=0).reset_index()
    return stacked.set_index(index_levels).sort_index()

In [21]:
quartile_epistasis_df = add_quartiles_and_combine(TrpB_epistasis_df, GB1_epistasis_df)
quartile_epistasis_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,fit_ab,fit_Ab,fit_aB,fit_AB,epistasis_type
protein,start_seq,positions,res1_AA,res2_AA,quartile,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GB1,AAAA,01,C,C,Q4,0.162574,0.338931,0.132318,0.074346,sign
GB1,AAAA,01,C,D,Q4,0.162574,0.338931,0.195667,0.224941,sign
GB1,AAAA,01,C,E,Q4,0.162574,0.338931,0.044238,0.104625,magnitude
GB1,AAAA,01,C,F,Q4,0.162574,0.338931,0.014666,0.133364,magnitude
GB1,AAAA,01,C,G,Q4,0.162574,0.338931,0.002910,0.096729,magnitude
...,...,...,...,...,...,...,...,...,...,...
TrpB,YYYG,23,W,S,Q2,0.086705,0.038344,0.016578,0.005757,magnitude
TrpB,YYYG,23,W,T,Q2,0.086705,0.038344,0.030715,0.008323,magnitude
TrpB,YYYG,23,W,V,Q2,0.086705,0.038344,-0.000589,0.003343,sign
TrpB,YYYG,23,W,W,Q2,0.086705,0.038344,-0.033119,-0.012797,sign


In [22]:
filtered_quartile_epistasis_df = add_quartiles_and_combine(filtered_TrpB_epistasis_df, filtered_GB1_epistasis_df)
filtered_quartile_epistasis_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,fit_ab,fit_Ab,fit_aB,fit_AB,epistasis_type,epsilon
protein,start_seq,positions,res1_AA,res2_AA,quartile,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GB1,AAAA,01,C,C,Q4,0.162574,0.338931,0.132318,0.074346,sign,-1.311150
GB1,AAAA,01,C,D,Q4,0.162574,0.338931,0.195667,0.224941,sign,-0.595240
GB1,AAAA,01,C,E,Q4,0.162574,0.338931,0.044238,0.104625,magnitude,0.126138
GB1,AAAA,01,C,F,Q4,0.162574,0.338931,0.014666,0.133364,magnitude,1.472900
GB1,AAAA,01,C,G,Q4,0.162574,0.338931,0.002910,0.096729,magnitude,2.768961
...,...,...,...,...,...,...,...,...,...,...,...
TrpB,YYKG,01,V,N,Q1,0.067835,0.091508,0.079191,0.439573,magnitude,1.414598
TrpB,YYKG,01,V,R,Q1,0.067835,0.091508,0.056512,0.054787,sign,-0.330349
TrpB,YYKG,01,V,T,Q1,0.067835,0.091508,0.056549,0.290510,sign,1.337185
TrpB,YYKG,02,E,Y,Q1,0.067835,0.069020,0.086705,0.054073,reciprocal sign,-0.489491


In [23]:
# Grouped by quartile and filtered
# For every (protein, starting sequence, quartile): the fraction of that
# starting sequence's double mutants falling into each epistasis type.
temp = filtered_quartile_epistasis_df.copy()

# Categorical so that epistasis types with zero occurrences still get a row
temp['epistasis_type'] = temp['epistasis_type'].astype('category')

grouped_filtered_quartile_df = pd.DataFrame(temp.groupby(['protein', 'start_seq', 'quartile', 'epistasis_type']).size()).rename(columns={0:'count'})

grouped_filtered_quartile_df['total'] = grouped_filtered_quartile_df.groupby(['protein', 'start_seq', 'quartile'])['count'].transform('sum')
grouped_filtered_quartile_df['frac epistasis type'] = grouped_filtered_quartile_df['count'] / grouped_filtered_quartile_df['total']

# Drop (protein, start_seq, quartile) combinations that have no rows at all
grouped_filtered_quartile_df = grouped_filtered_quartile_df[grouped_filtered_quartile_df['total'] > 0].copy()

# Rename the last level ('epistasis_type' -> 'epistasis type'); the plotting
# cells refer to it by the name with a space.
grouped_filtered_quartile_df.index.names = ['protein', 'start_seq', 'quartile', 'epistasis type']

# Grouped by quartile and unfiltered
## SI Figure where I don't filter the data
# Same steps as above, applied to the table without the fitness filter
temp = quartile_epistasis_df.copy()

temp['epistasis_type'] = temp['epistasis_type'].astype('category')

grouped_unfiltered_quartile_df = pd.DataFrame(temp.groupby(['protein', 'start_seq', 'quartile', 'epistasis_type']).size()).rename(columns={0:'count'})

grouped_unfiltered_quartile_df['total'] = grouped_unfiltered_quartile_df.groupby(['protein', 'start_seq', 'quartile'])['count'].transform('sum')
grouped_unfiltered_quartile_df['frac epistasis type'] = grouped_unfiltered_quartile_df['count'] / grouped_unfiltered_quartile_df['total']

grouped_unfiltered_quartile_df = grouped_unfiltered_quartile_df[grouped_unfiltered_quartile_df['total'] > 0].copy()

grouped_unfiltered_quartile_df.index.names = ['protein', 'start_seq', 'quartile', 'epistasis type']

In [38]:
def hook(plot,element):
    """Plot hook: fix the order of the (epistasis type, quartile) categories on the x axis."""
    epistasis_types = ['magnitude', 'sign', 'reciprocal sign']
    quartiles = ['Q1', 'Q2', 'Q3', 'Q4']
    plot.handles['plot'].x_range.factors = list(itertools.product(epistasis_types, quartiles))

def _quartile_violin(grouped_quartile_df):
    """Split violin (TrpB vs. GB1) of the epistasis-type fraction per quartile."""
    violin = hv.Violin(
        grouped_quartile_df.sort_index(ascending=False),
        kdims=['protein','epistasis type', 'quartile'],
        vdims=['frac epistasis type']
    )
    return violin.opts(
        split='protein',
        frame_height=250,
        frame_width=600,
        violin_width=1.5,
        fontscale=1.3,
        hooks=[fixmargins, one_decimal_y, hook],
        show_legend=True,
        inner=None,
        ylabel='fraction of epistasis type'
    )

# Main-text panel (fitness-filtered data); kept by name for the export cell
figure_2b = _quartile_violin(grouped_filtered_quartile_df)

# Shown next to the same plot built from the unfiltered data
figure_2b + _quartile_violin(grouped_unfiltered_quartile_df)

### Export figure for paper

In [39]:
positional_epistasis_TrpB

In [40]:
positional_epistasis_GB1

In [41]:
folder = '../../../data/output_figures/'

# (figure, output file name) pairs, exported in this order
figures_to_export = [
    (figure_2b, 'figure2b_pairwise_epistasis.svg'),
    (positional_epistasis_TrpB, 'figure2c_trpb_positional_epistasis.svg'),
    (positional_epistasis_GB1, 'figure2d_GB1_positional_epistasis.svg'),
]

for figure, svg_name in figures_to_export:
    # Render to a bokeh figure and switch it to the SVG backend before exporting
    plot = hv.render(figure, backend='bokeh')
    plot.output_backend = "svg"

    filename = f'{folder}{svg_name}'
    exported_files = export_svg(plot, filename=filename)

# Show the return value of the last export, as the cell did before
exported_files

ERROR:bokeh.core.validation.check:E-1006 (NON_MATCHING_DATA_SOURCES_ON_LEGEND_ITEM_RENDERERS): LegendItem.label is a field, but renderer data sources don't match: LegendItem(id='11196', ...)


['../../../data/output_figures/figure2d_GB1_positional_epistasis.svg']

## Make the violins as two BoxWhisker plots

In [42]:
def hook(plot,element):
    """Plot hook: fix the order of the (epistasis type, quartile) categories on the x axis."""
    epistasis_types = ['magnitude', 'sign', 'reciprocal sign']
    quartiles = ['Q1', 'Q2', 'Q3', 'Q4']
    plot.handles['plot'].x_range.factors = list(itertools.product(epistasis_types, quartiles))

temp = grouped_filtered_quartile_df.reset_index()

# Options shared by the TrpB and GB1 panels
filtered_box_opts = dict(
    frame_height=250,
    frame_width=600,
    fontscale=1.3,
    hooks=[fixmargins, one_decimal_y, hook],
    show_legend=True,
    ylabel='fraction of epistasis type'
)

filtered_box_TrpB = hv.BoxWhisker(
    temp[temp['protein'] == 'TrpB'],
    kdims=['epistasis type', 'quartile'],
    vdims=['frac epistasis type']
).opts(**filtered_box_opts)

# GB1 gets the second Category10 colour so the two panels are distinguishable
filtered_box_GB1 = hv.BoxWhisker(
    temp[temp['protein'] == 'GB1'],
    kdims=['epistasis type', 'quartile'],
    vdims=['frac epistasis type']
).opts(
    box_fill_color=bokeh.palettes.Category10[3][1],
    **filtered_box_opts
)

filtered_box_TrpB + filtered_box_GB1



In [43]:
def hook(plot,element):
    """Plot hook: fix the order of the (epistasis type, quartile) categories on the x axis."""
    epistasis_types = ['magnitude', 'sign', 'reciprocal sign']
    quartiles = ['Q1', 'Q2', 'Q3', 'Q4']
    plot.handles['plot'].x_range.factors = list(itertools.product(epistasis_types, quartiles))

temp = grouped_unfiltered_quartile_df.reset_index()

# Options shared by the TrpB and GB1 panels
unfiltered_box_opts = dict(
    frame_height=250,
    frame_width=600,
    fontscale=1.3,
    hooks=[fixmargins, one_decimal_y, hook],
    show_legend=True,
    ylabel='fraction of epistasis type'
)

unfiltered_box_TrpB = hv.BoxWhisker(
    temp[temp['protein'] == 'TrpB'],
    kdims=['epistasis type', 'quartile'],
    vdims=['frac epistasis type']
).opts(**unfiltered_box_opts)

# GB1 gets the second Category10 colour so the two panels are distinguishable
unfiltered_box_GB1 = hv.BoxWhisker(
    temp[temp['protein'] == 'GB1'],
    kdims=['epistasis type', 'quartile'],
    vdims=['frac epistasis type']
).opts(
    box_fill_color=bokeh.palettes.Category10[3][1],
    **unfiltered_box_opts
)

unfiltered_box_TrpB + unfiltered_box_GB1



## Save as HTML

In [30]:
os.system('jupyter nbconvert --to html pairwise_epistasis.ipynb')

[NbConvertApp] Converting notebook pairwise_epistasis.ipynb to html
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] Writing 6471556 bytes to pairwise_epistasis.html


0