In [3]:
# Switch to the project directory so the relative 'data/...' paths read below resolve.
%cd ~/SSMuLA/

/disk2/fli/SSMuLA


In [1]:
# General imports
import glob
import os
import re
import pickle
import datetime

# Data manipulation
#import growth_analysis as ga
import pandas as pd
#from multiprocesspandas import applyparallel
import numpy as np
from sklearn.impute import KNNImputer

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter

from bokeh.themes.theme import Theme
# theme = Theme(
#     json=ga.PLOT_STYLE
# )
#hv.renderer('bokeh').theme = theme

import panel as pn
pn.config.comms = "vscode"

# Large data plotting
#import datashader as ds
#from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
import tqdm
from multiprocessing import Pool
from operator import itemgetter

hv.extension('bokeh')

In [16]:
# Load the TrpB landscape (measured + imputed variants), ordered by sequence
TrpB_data = (
    pd.read_csv('data/TrpB/fitness_landscape/TrpB4.csv', index_col=0)
    .sort_values('AAs')
    .reset_index()
)

# Rank 1 is the fittest variant
TrpB_data['rank'] = TrpB_data['fitness'].rank(ascending=False)

# Natural log of the fitness; negative fitness values come out as NaN
TrpB_data['scaled_fitness'] = np.log(TrpB_data['fitness'])

# Lowest fitness among the variants flagged as active
TrpB_fit_min = TrpB_data.loc[TrpB_data['active'], 'fitness'].min()

TrpB_data, TrpB_fit_min

(         AAs AA1 AA2 AA3 AA4  # Stop   fitness  active      rank  \
 0       ****   *   *   *   *     4.0 -0.001038   False  144721.0   
 1       ***A   *   *   *   A     3.0 -0.039819   False  189008.0   
 2       ***C   *   *   *   C     3.0 -0.001858   False  147087.0   
 3       ***E   *   *   *   E     3.0 -0.072498   False  192552.0   
 4       ***F   *   *   *   F     3.0  0.010546   False   98274.0   
 ...      ...  ..  ..  ..  ..     ...       ...     ...       ...   
 193165  YYYS   Y   Y   Y   S     0.0  0.016578   False   69518.0   
 193166  YYYT   Y   Y   Y   T     0.0  0.030715   False   29764.0   
 193167  YYYV   Y   Y   Y   V     0.0 -0.000589   False  143365.0   
 193168  YYYW   Y   Y   Y   W     0.0 -0.033119   False  186911.0   
 193169  YYYY   Y   Y   Y   Y     0.0 -0.023758   False  181756.0   
 
         scaled_fitness  
 0                  NaN  
 1                  NaN  
 2                  NaN  
 3                  NaN  
 4            -4.551975  
 ...          

In [12]:
# VDGV is parent

# Load the measured GB1 landscape; 'Variants' is renamed to 'AAs' to match the TrpB table
GB1_data = pd.read_csv("data/GB1/fitness_landscape/GB1.csv").rename(columns={'Variants': 'AAs'})

# Split each variant string into per-site columns AA1..AA4, placed right after 'AAs'
for i in range(4):
    GB1_data.insert(i+1, f'AA{i+1}', GB1_data['AAs'].map(itemgetter(i)))

# Express fitness relative to the landscape maximum, the same scaling used for TrpB
GB1_data['Fitness/max'] = GB1_data['Fitness'] / GB1_data['Fitness'].max()
GB1_fit_min = 0.01 / GB1_data['Fitness'].max()
# GB1_min_top9783 = GB1_data[GB1_data['imputed'] == False]['Fitness'].values[9782] # wouldn't we want to sort by fitness first

# A variant is active when its scaled fitness exceeds the threshold. Only active
# variants are used as starting points by the simulations below; the others stay
# in the table and can still be visited.
GB1_data['active'] = GB1_data['Fitness/max'] > GB1_fit_min

# Rank 1 is the fittest variant
GB1_data['rank'] = GB1_data['Fitness'].rank(ascending=False)

# Natural log of the raw fitness
GB1_data['scaled_fitness'] = np.log(GB1_data['Fitness'])

GB1_data

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,HD,Count input,Count selected,Fitness,Fitness/max,active,rank,scaled_fitness
0,VDGV,V,D,G,V,0,92735,338346,1.000000,0.114130,True,3644.0,0.000000
1,ADGV,A,D,G,V,1,34,43,0.061910,0.007066,True,13482.0,-2.782079
2,CDGV,C,D,G,V,1,850,641,0.242237,0.027646,True,8295.0,-1.417838
3,DDGV,D,D,G,V,1,63,63,0.006472,0.000739,False,47275.0,-5.040256
4,EDGV,E,D,G,V,1,841,190,0.032719,0.003734,True,17053.0,-3.419794
...,...,...,...,...,...,...,...,...,...,...,...,...,...
149356,YYYR,Y,Y,Y,R,4,203,1,0.001350,0.000154,False,112286.0,-6.607529
149357,YYYS,Y,Y,Y,S,4,186,3,0.004421,0.000505,False,61661.5,-5.421457
149358,YYYT,Y,Y,Y,T,4,181,14,0.021200,0.002420,True,21029.5,-3.853763
149359,YYYW,Y,Y,Y,W,4,30,1,0.009136,0.001043,False,36917.0,-4.695520


In [13]:
# Copied from  Kadina's DE_simulations.py
def make_new_sequence(input_seq, new_AA, position):
    """Return a copy of ``input_seq`` with the residue at ``position`` replaced.

    ``position`` follows list-indexing rules: negative indices count from the
    end and an out-of-range index raises ``IndexError``.
    """
    residues = [*input_seq]
    residues[position] = new_AA
    return "".join(residues)

def ecdf_transform(data):
    """Map each value to its empirical CDF position, i.e. ordinal rank / n.

    Ties are broken by order of appearance (``method="first"``), so every
    element receives a distinct value in (0, 1].
    """
    n_obs = len(data)
    ordinal_rank = data.rank(method="first")
    return ordinal_rank / n_obs

def simulate_single_step_DE(data, seq_col, fitness_col, n_sites=4): # single-step SSM 
    """Simulate greedy single-site saturation mutagenesis (SSM) walks.

    Every active variant is used as a starting point and, for each one, every
    permutation of the ``n_sites`` positions is walked. At each position all 20
    amino acids are tried and the walk moves to any variant that is strictly
    fitter than the current best, so it leaves each position on the fittest
    variant seen so far.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a boolean ``'active'``
        column and a ``'rank'`` column. A copy is made; the input is untouched.
    seq_col : str
        Column holding the variant sequences; ``'_'`` separators are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of mutated positions in each sequence.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness of every (start, order) walk. ``output_df`` has one row per
        walk with columns ``start_seq``, ``order``, ``start_fitness``,
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``.

    Raises
    ------
    KeyError
        If a walk ends on a sequence that is absent from ``data``. Absent
        variants are scored 0, so this can only happen when the best measured
        fitness along the walk is negative.
    """
    data = data.copy()
    data[seq_col] = data[seq_col].apply(lambda x: x.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data[data['active']][seq_col].values

    position_orders = list(itertools.permutations(range(n_sites)))
    n_orders = len(position_orders)
    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs) * n_orders)
    fitness_dict = {}

    # Wrap the array itself (not the enumerate) so tqdm knows the total.
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]

        # Walk every possible order of the positions
        for j, temp_order in enumerate(position_orders):

            best_seq = start_seq
            best_fitness = start_fitness

            for pos in temp_order:

                # Try all possible mutations at the position
                for AA in AA_list:
                    temp_seq = make_new_sequence(best_seq, AA, pos)

                    # Variants missing from the data count as fitness 0.
                    temp_fitness = data_dict.get(temp_seq, 0)

                    # Keep the variant only if it beats everything seen so far
                    if temp_fitness > best_fitness:
                        best_seq = temp_seq
                        best_fitness = temp_fitness

            fitness_array[n_orders * i + j] = best_fitness
            fitness_dict[(start_seq, temp_order)] = [start_fitness, best_seq, best_fitness]

    output_df = pd.DataFrame(fitness_dict).T.reset_index().rename(columns={'level_0':'start_seq', 'level_1':'order', 0:'start_fitness', 1:'final_seq', 2:'final_fitness'})

    output_df["final_fitness ECDF"] = output_df[
        'final_fitness'
    ].transform(ecdf_transform).values

    # assign the original landscape rank to each final variant
    output_df['rank'] = output_df['final_seq'].apply(lambda x: rank_dict[x])

    return (fitness_array,output_df)

def simulate_simple_SSM_recomb_DE(data, seq_col, fitness_col, n_sites=4): # SSM recomb
    """Simulate one round of SSM at every site followed by simple recombination.

    For each active starting variant, the best single mutant at each position
    is found independently. The best residue from every position is then
    combined into one recombined sequence. The result for that start is the
    fittest of: the start itself, the recombined sequence, and the per-site
    best single mutants.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a boolean ``'active'``
        column and a ``'rank'`` column. A copy is made; the input is untouched.
    seq_col : str
        Column holding the variant sequences; ``'_'`` separators are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of mutated positions in each sequence.

    Returns
    -------
    pandas.DataFrame
        One row per starting variant with columns ``start_seq``,
        ``start_fitness``, ``top_SSM_variants``, ``final_seq``,
        ``final_fitness``, ``final_fitness ECDF`` and ``rank``.

    Raises
    ------
    KeyError
        If the final sequence is absent from ``data``. Absent variants are
        scored 0, so this requires a negative starting fitness.
    """
    data = data.copy()
    data[seq_col] = data[seq_col].apply(lambda x: x.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data[data['active']][seq_col].values

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_dict = {}

    for start_seq in tqdm.tqdm(active_AAs):

        start_fitness = data_dict[start_seq]

        # Best single mutant found at each position, and the fitness it scored
        top_SSM_variants = {}
        top_SSM_fitness = {}

        for pos in range(n_sites):

            best_seq = start_seq
            best_fitness = start_fitness

            # Try all possible mutations at the position to find the best
            for AA in AA_list:
                temp_seq = make_new_sequence(start_seq, AA, pos)

                # Variants missing from the data count as fitness 0.
                temp_fitness = data_dict.get(temp_seq, 0)

                if temp_fitness > best_fitness:
                    best_seq = temp_seq
                    best_fitness = temp_fitness

            top_SSM_variants[pos] = best_seq
            top_SSM_fitness[pos] = best_fitness

        # simple recombination: take the winning residue from each position
        recomb_seq = ''.join(top_SSM_variants[pos][pos] for pos in range(n_sites))
        recomb_fitness = data_dict.get(recomb_seq, 0)

        best_seq = start_seq
        best_fitness = start_fitness

        # Return the best of the start, the recombined sequence and the SSM variants.
        if recomb_fitness > best_fitness:
            best_seq = recomb_seq
            best_fitness = recomb_fitness

        # Reuse the fitness recorded above rather than looking it up again:
        # a per-site winner may be a variant that is absent from the data.
        for pos, SSM_seq in top_SSM_variants.items():
            SSM_fit = top_SSM_fitness[pos]
            if SSM_fit > best_fitness:
                best_seq = SSM_seq
                best_fitness = SSM_fit

        fitness_dict[start_seq] = [start_fitness, tuple(top_SSM_variants.values()), best_seq, best_fitness]

    output_df = pd.DataFrame(fitness_dict).T.reset_index().rename(columns={'index':'start_seq', 0:'start_fitness', 1:'top_SSM_variants', 2:'final_seq', 3:'final_fitness'})

    output_df["final_fitness ECDF"] = output_df[
        'final_fitness'
    ].transform(ecdf_transform).values

    # assign the original landscape rank to each final variant
    output_df['rank'] = output_df['final_seq'].apply(lambda x: rank_dict[x])

    return output_df

def try_start_seq(start_seq, data_dict, AA_list, n_sites, N):
    """Predict the top-``N`` combinations from single-site data and test them.

    All single mutants of ``start_seq`` are scored first. Every combination of
    residues from ``AA_list`` is then given a predicted improvement equal to
    the product of the single-site fitness ratios (single-mutant fitness
    divided by the starting fitness). The ``N`` best-predicted combinations
    are looked up, and the fittest of the start, those combinations and the
    single mutants is returned.

    Parameters
    ----------
    start_seq : str
        Starting variant; must be a key of ``data_dict``.
    data_dict : dict
        Maps sequence -> fitness. Sequences missing from it count as fitness 0.
    AA_list : sequence of str
        Single-letter residues to try at each position.
    n_sites : int
        Number of positions to mutate.
    N : int
        Number of top-predicted combinations to test.

    Returns
    -------
    tuple
        ``(start_fitness, best_seq, best_fitness)`` where ``best_seq`` is
        always a full-length sequence.
    """
    start_fitness = data_dict[start_seq]

    SSM_data = {}        # pos -> {residue: fitness of that single mutant}
    SSM_to_compare = {}  # pos -> {full single-mutant sequence: fitness}

    # Collect the single-site data
    for pos in range(n_sites):

        SSM_data[pos] = {}
        SSM_to_compare[pos] = {}
        residues = list(start_seq)

        for AA in AA_list:
            residues[pos] = AA
            temp_seq = ''.join(residues)

            # Variants missing from the data count as fitness 0.
            temp_fitness = data_dict.get(temp_seq, 0)

            SSM_data[pos][AA] = temp_fitness
            SSM_to_compare[pos][temp_seq] = temp_fitness

    # Score every combination under a multiplicative model of the single-site
    # effects. The combinations come from AA_list so that every residue has
    # an entry in SSM_data.
    calculated_improvement = {
        ''.join(combo): np.prod([SSM_data[i][combo[i]] / start_fitness for i in range(n_sites)])
        for combo in itertools.product(AA_list, repeat=n_sites)
    }

    # A stable sort makes the choice among tied predictions reproducible.
    top_predicted = (
        pd.DataFrame(calculated_improvement.items(), columns=['AAs', 'calculated improvement'])
        .sort_values('calculated improvement', ascending=False, kind='mergesort')
        .head(N)['AAs']
        .values
    )

    best_seq = start_seq
    best_fitness = start_fitness

    for variant_seq in top_predicted:
        variant_fit = data_dict.get(variant_seq, 0)
        if variant_fit > best_fitness:
            best_seq = variant_seq
            best_fitness = variant_fit

    # Also check the single mutants themselves. Iterate the sequence-keyed
    # table so best_seq is a full sequence rather than a residue letter.
    for pos in range(n_sites):
        for SSM_seq, SSM_fit in SSM_to_compare[pos].items():
            if SSM_fit > best_fitness:
                best_seq = SSM_seq
                best_fitness = SSM_fit

    return (start_fitness, best_seq, best_fitness)

def sample_SSM_test_top_N(data, seq_col, fitness_col, n_sites=4, N=96, max_samples=None, n_jobs=1, random_state=None): # SSM predict top 96
    """Run the "SSM, then test the top-N predicted combinations" simulation.

    The active variants are shuffled, optionally truncated to ``max_samples``,
    and each one is passed to ``try_start_seq`` in a multiprocessing pool.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a boolean ``'active'``
        column and a ``'rank'`` column. A copy is made; the input is untouched.
    seq_col : str
        Column holding the variant sequences; ``'_'`` separators are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of mutated positions in each sequence.
    N : int, default 96
        Number of top-predicted combinations tested per starting variant.
    max_samples : int or None, default None
        If given, only the first ``max_samples`` shuffled starting variants
        are simulated. Any integer type is accepted, including numpy integers.
    n_jobs : int, default 1
        Number of worker processes.
    random_state : int or None, default None
        Seed for the shuffle of starting variants. ``None`` keeps the shuffle
        unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per starting variant with columns ``start_seq``,
        ``start_fitness``, ``final_seq``, ``final_fitness``,
        ``final_fitness ECDF`` and ``rank``.
    """
    data = data.copy()
    data[seq_col] = data[seq_col].apply(lambda x: x.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    # Shuffle the active variants; use seq_col rather than a fixed column name.
    active_AAs = data[data['active']].sample(frac=1, random_state=random_state)[seq_col].values

    if max_samples is not None:
        active_AAs = active_AAs[:max_samples]

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')

    # One argument tuple per starting variant for the worker function
    pool_args = [(start_seq, data_dict, AA_list, n_sites, N) for start_seq in active_AAs]

    # NOTE(review): this bar appears to advance as tasks are handed to the
    # pool, not as results complete, so it can stall before the work is done.
    with Pool(n_jobs) as pool:
        results = pool.starmap(try_start_seq, tqdm.tqdm(pool_args))

    fitness_dict = dict(zip(active_AAs, results))

    output_df = pd.DataFrame(fitness_dict).T.reset_index().rename(columns={'index':'start_seq', 0:'start_fitness', 1:'final_seq', 2:'final_fitness'})

    output_df["final_fitness ECDF"] = output_df[
        'final_fitness'
    ].transform(ecdf_transform).values

    # assign the original landscape rank to each final variant
    output_df['rank'] = output_df['final_seq'].apply(lambda x: rank_dict[x])

    return output_df

def simulate_iterative_SM(data, seq_col, fitness_col, n_sites=4):
    """Simulate greedy iterative site mutagenesis with a data-driven site order.

    Starting from each active variant, every round scans all single mutants at
    all positions not yet fixed and moves to the fittest one. The position it
    was found at is fixed and removed from later rounds. A walk stops after
    ``n_sites`` rounds or as soon as a round finds no strict improvement.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a boolean ``'active'``
        column and a ``'rank'`` column. A copy is made; the input is untouched.
    seq_col : str
        Column holding the variant sequences; ``'_'`` separators are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of mutated positions in each sequence.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness per starting variant. ``output_df`` has one row per start
        with columns ``start_seq``, ``order`` (the positions in the order
        they were fixed), ``start_fitness``, ``final_seq``,
        ``final_fitness``, ``final_fitness ECDF`` and ``rank``.

    Raises
    ------
    KeyError
        If a walk ends on a sequence that is absent from ``data``. Absent
        variants are scored 0, so this requires a negative fitness on the walk.
    """
    data = data.copy()
    data[seq_col] = data[seq_col].apply(lambda x: x.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    # Use seq_col rather than a fixed column name.
    active_AAs = data[data['active']][seq_col].values

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs))
    fitness_dict = {}

    # Wrap the array itself (not the enumerate) so tqdm knows the total.
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]

        best_seq = start_seq
        best_fitness = start_fitness

        remaining_positions = list(range(n_sites))
        temp_order = []

        for _ in range(n_sites):
            # Every candidate in this round is a single mutant of the sequence
            # that was best when the round started.
            previous_best_seq = best_seq
            found_improvement = False

            for pos in remaining_positions:

                # Try all possible mutations at the position
                for AA in AA_list:
                    temp_seq = make_new_sequence(previous_best_seq, AA, pos)

                    # Variants missing from the data count as fitness 0.
                    temp_fitness = data_dict.get(temp_seq, 0)

                    if temp_fitness > best_fitness:
                        best_seq = temp_seq
                        best_fitness = temp_fitness
                        best_site = pos
                        found_improvement = True

            if not found_improvement:
                # finish if there are no more beneficial mutations
                break

            remaining_positions.remove(best_site)
            temp_order.append(best_site)

        temp_order = tuple(temp_order)
        fitness_array[i] = best_fitness
        fitness_dict[(start_seq, temp_order)] = [start_fitness, best_seq, best_fitness]

    output_df = pd.DataFrame(fitness_dict).T.reset_index().rename(columns={'level_0':'start_seq', 'level_1':'order', 0:'start_fitness', 1:'final_seq', 2:'final_fitness'})

    output_df["final_fitness ECDF"] = output_df[
        'final_fitness'
    ].transform(ecdf_transform).values

    # assign the original landscape rank to each final variant
    output_df['rank'] = output_df['final_seq'].apply(lambda x: rank_dict[x])

    return (fitness_array,output_df)

In [9]:
def simulate_double_step_DE(data, seq_col, fitness_col, n_sites=4): # double-step SSM 
    """Simulate greedy double-site saturation mutagenesis walks.

    Each walk mutates two pairs of positions in turn. For a pair, all 20 x 20
    residue combinations at those two positions are tried and the walk moves
    to any variant that is strictly fitter than the current best.

    The pair orders are built by zipping the list of 2-position combinations
    with its reverse. For ``n_sites=4`` this pairs every combination with its
    complement, e.g. ``((0, 1), (2, 3))``. For other values of ``n_sites``
    the two pairs are not complementary.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a boolean ``'active'``
        column and a ``'rank'`` column. A copy is made; the input is untouched.
    seq_col : str
        Column holding the variant sequences; ``'_'`` separators are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of mutated positions in each sequence.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness of every (start, order) walk. ``output_df`` has one row per
        walk with columns ``start_seq``, ``order``, ``start_fitness``,
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``.

    Raises
    ------
    KeyError
        If a walk ends on a sequence that is absent from ``data``. Absent
        variants are scored 0, so this requires a negative fitness on the walk.
    """
    data = data.copy()
    data[seq_col] = data[seq_col].apply(lambda x: x.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data[data['active']][seq_col].values

    # i-th pair of sites followed by the i-th pair counted from the end
    groups = list(itertools.combinations(range(n_sites), 2))
    position_orders = list(zip(groups, reversed(groups)))
    n_orders = len(position_orders)

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs) * n_orders)
    fitness_dict = {}

    # Wrap the array itself (not the enumerate) so tqdm knows the total.
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]

        # Walk every pair order
        for j, temp_order in enumerate(position_orders):

            best_seq = start_seq
            best_fitness = start_fitness

            for site_a, site_b in temp_order:

                # Both sites are overwritten in every candidate, so building
                # from the sequence held when the pair step starts gives the
                # same candidates as building from the running best.
                base_seq = best_seq

                for first_AA in AA_list:
                    half_seq = make_new_sequence(base_seq, first_AA, site_a)
                    for second_AA in AA_list:
                        temp_seq = make_new_sequence(half_seq, second_AA, site_b)

                        # Variants missing from the data count as fitness 0.
                        temp_fitness = data_dict.get(temp_seq, 0)

                        if temp_fitness > best_fitness:
                            best_seq = temp_seq
                            best_fitness = temp_fitness

            fitness_array[n_orders * i + j] = best_fitness
            fitness_dict[(start_seq, temp_order)] = [start_fitness, best_seq, best_fitness]

    output_df = pd.DataFrame(fitness_dict).T.reset_index().rename(columns={'level_0':'start_seq', 'level_1':'order', 0:'start_fitness', 1:'final_seq', 2:'final_fitness'})

    output_df["final_fitness ECDF"] = output_df[
        'final_fitness'
    ].transform(ecdf_transform).values

    # assign the original landscape rank to each final variant
    output_df['rank'] = output_df['final_seq'].apply(lambda x: rank_dict[x])

    return (fitness_array,output_df)

In [14]:
# Run the single-step and double-step SSM simulations on the GB1 landscape
# GB1_SSMN_output_df = sample_SSM_test_top_N(GB1_9783, 'AAs', 'Fitness', n_sites=4) #stops at 7337?
# GB1_SSMr_output_df = simulate_simple_SSM_recomb_DE(GB1_9783, 'AAs', 'Fitness', n_sites=4)
GB1_ssSSM_fitness_array, GB1_ssSSM_output_df = simulate_single_step_DE(
    data=GB1_data, seq_col='AAs', fitness_col='Fitness', n_sites=4
)
GB1_dsSSM_fitness_array, GB1_dsSSM_output_df = simulate_double_step_DE(
    data=GB1_data, seq_col='AAs', fitness_col='Fitness', n_sites=4
)

# Rescale each method's final fitness so that its own maximum is 1
# GB1_SSMr_output_df['final_fitness'] = GB1_SSMr_output_df['final_fitness']/GB1_SSMr_output_df['final_fitness'].max()
GB1_ssSSM_output_df['final_fitness'] = GB1_ssSSM_output_df['final_fitness'].div(
    GB1_ssSSM_output_df['final_fitness'].max()
)
GB1_dsSSM_output_df['final_fitness'] = GB1_dsSSM_output_df['final_fitness'].div(
    GB1_dsSSM_output_df['final_fitness'].max()
)

0it [00:00, ?it/s]

34545it [00:19, 1803.50it/s]
34545it [00:51, 675.69it/s]


In [17]:
# Run the single-step and double-step SSM simulations on the TrpB landscape
#TrpB_SSMN_output_df = sample_SSM_test_top_N(TrpB_data, 'AAs', 'fitness', n_sites=4) # stops at 7337?
# TrpB_SSMr_output_df = simulate_simple_SSM_recomb_DE(TrpB_data, 'AAs', 'fitness', n_sites=4)
TrpB_ssSSM_fitness_array, TrpB_ssSSM_output_df = simulate_single_step_DE(
    data=TrpB_data, seq_col='AAs', fitness_col='fitness', n_sites=4
)
TrpB_dsSSM_fitness_array, TrpB_dsSSM_output_df = simulate_double_step_DE(
    data=TrpB_data, seq_col='AAs', fitness_col='fitness', n_sites=4
)

# Rescale each method's final fitness so that its own maximum is 1
# TrpB_SSMr_output_df['final_fitness'] = TrpB_SSMr_output_df['final_fitness']/TrpB_SSMr_output_df['final_fitness'].max()
TrpB_ssSSM_output_df['final_fitness'] = TrpB_ssSSM_output_df['final_fitness'].div(
    TrpB_ssSSM_output_df['final_fitness'].max()
)
TrpB_dsSSM_output_df['final_fitness'] = TrpB_dsSSM_output_df['final_fitness'].div(
    TrpB_dsSSM_output_df['final_fitness'].max()
)

10140it [00:05, 1721.50it/s]
10140it [00:16, 612.64it/s]


In [18]:
GB1_ssSSM_output_df

Unnamed: 0,start_seq,order,start_fitness,final_seq,final_fitness,final_fitness ECDF,rank
0,VDGV,"(0, 1, 2, 3)",1.0,LYGV,0.579242,0.414543,90.0
1,VDGV,"(0, 1, 3, 2)",1.0,LYGV,0.579242,0.414544,90.0
2,VDGV,"(0, 2, 1, 3)",1.0,LYGV,0.579242,0.414545,90.0
3,VDGV,"(0, 2, 3, 1)",1.0,LYGV,0.579242,0.414546,90.0
4,VDGV,"(0, 3, 1, 2)",1.0,LYGV,0.579242,0.414547,90.0
...,...,...,...,...,...,...,...
829075,YYYT,"(3, 0, 2, 1)",0.0212,LWLG,0.735277,0.775997,16.0
829076,YYYT,"(3, 1, 0, 2)",0.0212,LILG,0.532094,0.358467,140.0
829077,YYYT,"(3, 1, 2, 0)",0.0212,WIFG,0.584684,0.458596,85.0
829078,YYYT,"(3, 2, 0, 1)",0.0212,WWLG,0.784978,0.838877,10.0


In [19]:
GB1_dsSSM_output_df

Unnamed: 0,start_seq,order,start_fitness,final_seq,final_fitness,final_fitness ECDF,rank
0,VDGV,"((0, 1), (2, 3))",1.0,LYCA,0.600399,0.277416,70.0
1,VDGV,"((0, 2), (1, 3))",1.0,WWFG,0.723981,0.516809,18.0
2,VDGV,"((0, 3), (1, 2))",1.0,FWAA,1.0,0.913219,1.0
3,VDGV,"((1, 2), (0, 3))",1.0,IWGF,0.65876,0.406011,33.0
4,VDGV,"((1, 3), (0, 2))",1.0,IWGF,0.65876,0.406016,33.0
...,...,...,...,...,...,...,...
207265,YYYT,"((0, 2), (1, 3))",0.0212,FYGN,0.627128,0.382371,46.0
207266,YYYT,"((0, 3), (1, 2))",0.0212,LWLG,0.735277,0.585570,16.0
207267,YYYT,"((1, 2), (0, 3))",0.0212,FYAA,0.91819,0.913215,2.0
207268,YYYT,"((1, 3), (0, 2))",0.0212,WIFG,0.584684,0.209123,85.0


In [20]:
# TrpB single-step SSM: [median, mean] of final_fitness, which the simulation cell
# has already divided by its maximum
print([TrpB_ssSSM_output_df['final_fitness'].median(), TrpB_ssSSM_output_df['final_fitness'].mean()])

[0.6585520961351918, 0.6145349340721439]


In [21]:
# GB1 single-step SSM: [median, mean] of final_fitness, which the simulation cell
# has already divided by its maximum
print([GB1_ssSSM_output_df['final_fitness'].median(), GB1_ssSSM_output_df['final_fitness'].mean()])

[0.5973187685820347, 0.5715228732086163]


In [22]:
# Stack the single-step SSM results of both landscapes, tagged by protein
temp_GB1 = GB1_ssSSM_output_df.reset_index().assign(protein='GB1')

# temp_syn_GB1 = syn_GB1_ssSSM_output_df.reset_index()
# temp_syn_GB1['protein'] = 'syn_GB1'

temp_TrpB = TrpB_ssSSM_output_df.reset_index().assign(protein='TrpB')

# Index by protein / start / end point so the rows sort into per-protein groups
combined_fitness_df = (
    pd.concat([temp_GB1, temp_TrpB])
    .set_index(['protein', 'start_seq', 'final_seq', 'final_fitness'])
    .sort_index()
)

In [23]:
combined_fitness_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,index,order,start_fitness,final_fitness ECDF,rank
protein,start_seq,final_seq,final_fitness,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GB1,AAAA,ANCA,0.862463,253190,"(2, 1, 0, 3)",1.61161,0.954437,3.0
GB1,AAAA,ANCA,0.862463,253191,"(2, 1, 3, 0)",1.61161,0.954439,3.0
GB1,AAAA,ANCA,0.862463,253193,"(2, 3, 1, 0)",1.61161,0.954440,3.0
GB1,AAAA,ANCA,0.862463,253199,"(3, 2, 1, 0)",1.61161,0.954441,3.0
GB1,AAAA,FHAA,0.646607,253182,"(1, 0, 2, 3)",1.61161,0.620021,38.0
...,...,...,...,...,...,...,...,...
TrpB,YYYG,YYYG,0.086705,243355,"(3, 0, 2, 1)",0.086705,0.068339,6205.0
TrpB,YYYG,YYYG,0.086705,243356,"(3, 1, 0, 2)",0.086705,0.068343,6205.0
TrpB,YYYG,YYYG,0.086705,243357,"(3, 1, 2, 0)",0.086705,0.068347,6205.0
TrpB,YYYG,YYYG,0.086705,243358,"(3, 2, 0, 1)",0.086705,0.068351,6205.0


In [24]:
# Stack the single-step (ssSSM) and double-step (dsSSM) results for TrpB and GB1,
# tagging every row with its protein and method
# temp_TrpB_1 = TrpB_SSMr_output_df.reset_index()
# temp_TrpB_1['protein'] = 'TrpB'
# temp_TrpB_1['method'] = 'SSMr'

temp_TrpB_2 = TrpB_ssSSM_output_df.reset_index().assign(protein='TrpB', method='ssSSM')

temp_TrpB_3 = TrpB_dsSSM_output_df.reset_index().assign(protein='TrpB', method='dsSSM')

# temp_syn_TrpB_1 = syn_TrpB_SSMr_output_df.reset_index()
# temp_syn_TrpB_1['protein'] = 'syn_TrpB'
# temp_syn_TrpB_1['method'] = 'SSMr'

temp_GB1_2 = GB1_ssSSM_output_df.reset_index().assign(protein='GB1', method='ssSSM')

temp_GB1_3 = GB1_dsSSM_output_df.reset_index().assign(protein='GB1', method='dsSSM')

# This replaces the protein-only combined_fitness_df built in the previous cell
combined_fitness_df = (
    pd.concat([temp_TrpB_2, temp_TrpB_3, temp_GB1_2, temp_GB1_3])
    .set_index(['protein', 'method', 'start_seq', 'final_seq', 'final_fitness'])
    .sort_index()
)

In [25]:
combined_fitness_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,index,order,start_fitness,final_fitness ECDF,rank
protein,method,start_seq,final_seq,final_fitness,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GB1,dsSSM,AAAA,ANCA,0.862463,63297,"((1, 2), (0, 3))",1.61161,0.831722,3.0
GB1,dsSSM,AAAA,ANCA,0.862463,63299,"((2, 3), (0, 1))",1.61161,0.831727,3.0
GB1,dsSSM,AAAA,FHAA,0.646607,63298,"((1, 3), (0, 2))",1.61161,0.394814,38.0
GB1,dsSSM,AAAA,FWAA,1.000000,63294,"((0, 1), (2, 3))",1.61161,0.947735,1.0
GB1,dsSSM,AAAA,VAAA,0.707733,63295,"((0, 2), (1, 3))",1.61161,0.500545,21.0
...,...,...,...,...,...,...,...,...,...
TrpB,ssSSM,YYYG,YYYG,0.086705,243355,"(3, 0, 2, 1)",0.086705,0.068339,6205.0
TrpB,ssSSM,YYYG,YYYG,0.086705,243356,"(3, 1, 0, 2)",0.086705,0.068343,6205.0
TrpB,ssSSM,YYYG,YYYG,0.086705,243357,"(3, 1, 2, 0)",0.086705,0.068347,6205.0
TrpB,ssSSM,YYYG,YYYG,0.086705,243358,"(3, 2, 0, 1)",0.086705,0.068351,6205.0


In [26]:
# Plot Hooks
def one_decimal_x(plot,element):
    """Plot hook: format the first x-axis tick labels with the "0.0" numeral format."""
    x_axis = plot.handles['plot'].xaxis[0]
    x_axis.formatter = NumeralTickFormatter(format="0.0")

def one_decimal_y(plot,element):
    """Plot hook: format the first y-axis tick labels with the "0.0" numeral format."""
    y_axis = plot.handles['plot'].yaxis[0]
    y_axis.formatter = NumeralTickFormatter(format="0.0")

def fixmargins(plot,element):
    """Plot hook: set fixed minimum borders, a solid black outline and an auto-hiding toolbar."""
    fig = plot.handles['plot']
    figure_settings = (
        ('min_border_right', 30),
        ('min_border_left', 65),
        ('min_border_top', 20),
        ('min_border_bottom', 65),
        ('outline_line_color', 'black'),
        ('outline_line_alpha', 1),
        ('outline_line_width', 1),
    )
    for attr_name, attr_value in figure_settings:
        setattr(fig, attr_name, attr_value)
    fig.toolbar.autohide = True

In [27]:
# Violin plot of the final fitness reached, one violin per (protein, method) pair
violin_opts = dict(
    # split='protein',
    frame_height=300,
    frame_width=300,
    violin_width=0.8,
    fontscale=1.3,
    hooks=[fixmargins],
    # show_legend=True,
    # inner=None,
    cut=0,
    # legend_position='top',
    ylabel='Max Fitness Achieved',
    violin_color=hv.dim('protein').str(),
    cmap='Category10',
    ylim=(0, 1),
)

figure_2b = hv.Violin(
    combined_fitness_df.sort_index(ascending=False),
    kdims=['protein', 'method'],
    vdims=['final_fitness'],
).opts(**violin_opts)

figure_2b

In [24]:
# Write the violin figure to a PNG file in the current working directory.
hv.save(figure_2b, 'dsSSM_ssSSM_GB1_TrpB.png', fmt='png')



In [17]:
# # plot ssSSM as hist
# from bokeh.plotting import figure, show

# p = figure(width=670, height=400, toolbar_location=None,
#            title="Rank")

# # Histogram
# bins = np.linspace(-3, 3, 40)
# hist, edges = np.histogram(x, density=True, bins=bins)
# p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
#          fill_color="skyblue", line_color="white",
#          legend_label="1000 random samples")

# show(p)


# NOTE(review): `syn_TrpB_ssSSM_output_df` is not created by any cell visible in
# this notebook (only commented-out `syn_*` references remain), so this cell
# presumably fails with NameError on a fresh kernel — confirm where it comes
# from, or switch to `TrpB_ssSSM_output_df` if that is what was intended.
# Bin the 'rank' column with np.histogram's default binning and plot the counts.
frequencies, edges = np.histogram(syn_TrpB_ssSSM_output_df['rank'])
print('Values: %s, Edges: %s' % (frequencies.shape[0], edges.shape[0]))
hv.Histogram((edges, frequencies))

Values: 10, Edges: 11
