In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

In [3]:
# General imports
import glob
import os
import re
import pickle
import datetime

# Data manipulation
import pandas as pd
import numpy as np

# Basic plotting
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter
from bokeh.io import output_notebook
output_notebook()


import panel as pn
pn.config.comms = "vscode"

# Large data plotting
import datashader as ds
from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
from tqdm.auto import tqdm
from multiprocessing import Pool
from operator import itemgetter

import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')

from SSMuLA.vis import JSON_THEME, save_bokeh_hv, one_decimal_x, one_decimal_y, fixmargins, LIB_COLORS

from bokeh.themes.theme import Theme

hv.renderer('bokeh').theme = JSON_THEME

from SSMuLA.aa_global import ALL_AAS
from SSMuLA.landscape_global import LIB_POS_0_IDX
from SSMuLA.util import checkNgen_folder, get_file_name, get_dir_name

In [4]:
from SSMuLA.landscape_optima import LocOpt

### Functions needed for downstream analyses

In [5]:
AA_list = list('ACDEFGHIKLMNPQRSTVWY')

def make_new_sequence(input_seq, new_AA, position):
    """
    Makes a new sequence by replacing the AA at a given position with a new AA
    """
    seq_list = list(input_seq)
    seq_list[position] = new_AA
    return ''.join(seq_list)

def hamming(str1, str2):
    """
    Calculates the hamming distance between two strings (how many edits between them)
    """
    assert len(str1) == len(str2)

    distance = 0
    for i in range(len(str1)):
        if str1[i] != str2[i]:
            distance += 1
    return distance

def determine_optima(active_variant, df, fit_col):
    """
    Determines how many single substitution variants are above a given variant in fitness.

    Parameters:
    -----------
    active_variant : str
        The sequence of the active/starting variant
    df : pd.DataFrame
        The dataframe containing the fitness data
    fit_col : str
        The column in the dataframe containing the fitness data

    Returns:
    --------
    variant_rank : int
        The rank of the active variant in the dataframe
    """

    # slice out the variant and all the variants at hamming distance 1. Then sort the dataframe by descending fitness
    temp = df[(df['AAs'].apply(lambda x: hamming(x, active_variant) == 1)) | (df['AAs'] == active_variant)].sort_values(fit_col, ascending=False).reset_index(drop=True).copy()

    # determine the rank of the active variant/how many variants are more active than it (0 is the best)
    variant_rank = temp[temp['AAs'] == active_variant].index[0]

    return variant_rank

### Import the TrpB data

In [9]:
df = pd.read_csv("/disk2/fli/SSMuLA/data/TrpB/scale2max/TrpB4.csv")
df = df[~df["AAs"].str.contains("\*")]
df

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts
9594,AAAA,A,A,A,A,0.0,0.074455,True,V183A:F184A:V227A:S228A
9595,AAAC,A,A,A,C,0.0,0.056314,True,V183A:F184A:V227A:S228C
9596,AAAD,A,A,A,D,0.0,0.014342,False,V183A:F184A:V227A:S228D
9597,AAAE,A,A,A,E,0.0,0.012914,False,V183A:F184A:V227A:S228E
9598,AAAF,A,A,A,F,0.0,0.005161,False,V183A:F184A:V227A:S228F
...,...,...,...,...,...,...,...,...,...
193165,YYYS,Y,Y,Y,S,0.0,0.016578,False,V183Y:F184Y:V227Y
193166,YYYT,Y,Y,Y,T,0.0,0.030715,False,V183Y:F184Y:V227Y:S228T
193167,YYYV,Y,Y,Y,V,0.0,-0.000589,False,V183Y:F184Y:V227Y:S228V
193168,YYYW,Y,Y,Y,W,0.0,-0.033119,False,V183Y:F184Y:V227Y:S228W


In [6]:
df = pd.read_csv("/disk2/fli/SSMuLA/data/GB1/scale2max/GB1.csv")
df = df[~df["AAs"].str.contains("\*")]
df

Unnamed: 0,AAs,fitness,active,muts
0,VDGV,0.114130,True,WT
1,ADGV,0.007066,True,V39A
2,CDGV,0.027646,True,V39C
3,DDGV,0.000739,False,V39D
4,EDGV,0.003734,True,V39E
...,...,...,...,...
149356,YYYR,0.000154,False,V39Y:D40Y:G41Y:V54R
149357,YYYS,0.000505,False,V39Y:D40Y:G41Y:V54S
149358,YYYT,0.002420,True,V39Y:D40Y:G41Y:V54T
149359,YYYW,0.001043,False,V39Y:D40Y:G41Y:V54W


In [6]:
# Import the imputed TrpB_data
TrpB_imputed_data = pd.read_csv(
     '../../../data/figure_data/4-site_imputed/20230828_KNN_imputed_TrpB.csv', 
    index_col=0
)

TrpB_imputed_data['imputed'] = True

# Import the measured TrpB_data
TrpB_measured_data = pd.read_csv(
    '../../../data/figure_data/4-site_merged_replicates/20230827/four-site_simplified_AA_data.csv',
)

TrpB_measured_data = TrpB_measured_data[TrpB_measured_data['# Stop'] == 0].copy().drop(columns=['# Stop'])

TrpB_measured_data['imputed'] = False

# Combine and sort the data
TrpB_data = pd.concat([TrpB_imputed_data, TrpB_measured_data]).sort_values('AAs').reset_index(drop=True)

TrpB_data

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,fitness,active,imputed
0,AAAA,A,A,A,A,0.074455,True,False
1,AAAC,A,A,A,C,0.056314,True,False
2,AAAD,A,A,A,D,0.014342,False,False
3,AAAE,A,A,A,E,0.012914,False,False
4,AAAF,A,A,A,F,0.005161,False,False
...,...,...,...,...,...,...,...,...
159995,YYYS,Y,Y,Y,S,0.016578,False,False
159996,YYYT,Y,Y,Y,T,0.030715,False,False
159997,YYYV,Y,Y,Y,V,-0.000589,False,False
159998,YYYW,Y,Y,Y,W,-0.033119,False,False


### Import the GB1 data

In [7]:
# VDGV is parent

# Import the measured GB1 data
GB1_measured_data = pd.read_csv('../../../data/figure_data/GB1_data/GB1_Fitness.csv').rename(columns={'AAString': 'AAs'}).drop(columns=['Mutations'])

GB1_measured_data['imputed'] = False

# Import the imputed GB1 data
GB1_imputed_data = pd.read_excel('../../../data/figure_data/GB1_data/GB1_missing_data.xlsx').rename(columns={'Variants': 'AAs', 'Imputed fitness': 'Fitness'})

GB1_imputed_data['imputed'] = True

# Combine the data and add AA1 -> AA4 columns
GB1_data = pd.concat([GB1_measured_data, GB1_imputed_data], ignore_index=True).sort_values('AAs').reset_index(drop=True)

for i in range(4):
    GB1_data.insert(i+1, f'AA{i+1}', GB1_data['AAs'].apply(lambda x: x[i]))

# Get the Fitness/max column to scale the data the same way as the TrpB data
GB1_data['Fitness/max'] = GB1_data['Fitness'] / GB1_data['Fitness'].max()
GB1_fit_min = 0.01

# Only set as active if they are not imputed and have a fitness above the minimum. This will prevent them from being included as starting points in the path analysis, but they will still appear in the graphs.
GB1_data['active'] = GB1_data.apply(lambda x: (x['Fitness'] > GB1_fit_min) & (x['imputed'] == False), axis=1)

GB1_data

  warn(msg)


Unnamed: 0,AAs,AA1,AA2,AA3,AA4,Fitness,imputed,Fitness/max,active
0,AAAA,A,A,A,A,1.611610,False,0.162574,True
1,AAAC,A,A,A,C,0.049726,True,0.005016,False
2,AAAD,A,A,A,D,0.011857,True,0.001196,False
3,AAAE,A,A,A,E,0.011416,True,0.001152,False
4,AAAF,A,A,A,F,0.029688,True,0.002995,False
...,...,...,...,...,...,...,...,...,...
159995,YYYS,Y,Y,Y,S,0.004421,False,0.000446,False
159996,YYYT,Y,Y,Y,T,0.021200,False,0.002139,True
159997,YYYV,Y,Y,Y,V,0.041952,False,0.004232,True
159998,YYYW,Y,Y,Y,W,0.009136,False,0.000922,False


### Find local optima among the active TrpB variants by finding the rank of the variant among all its single substitutions and just taking those which are the maximum fitness

In [10]:
# Get the active variants and the args for multiprocessing
active_variants = df[df["active"]]["AAs"].values
pool_args = [(x, df, "fitness") for x in active_variants]

# For every active variant determine its rank among its single mutants
with Pool(16) as pool:
    results = pool.starmap(determine_optima, tqdm(pool_args))

find_optima_dict = {active_variants[i]: results[i] for i in range(len(active_variants))}

# Convert this data to a dataframe and merge it with the TrpB_data to get the fitness information
temp = pd.merge(
    df.copy(),
    pd.DataFrame(find_optima_dict, index=["n_greater"])
    .T.sort_values("n_greater", ascending=False)
    .reset_index()
    .rename(columns={"index": "AAs"}),
)

# Get the local optima by finding the variants where no single mutant is more fit
local_optima = temp[temp["n_greater"] == 0].reset_index(drop=True).copy()
local_optima.sort_values("fitness", ascending=False)

  0%|          | 0/9783 [00:00<?, ?it/s]

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts,n_greater
7,AIKG,A,I,K,G,0.0,1.000000,True,V183A:F184I:V227K:S228G,0
25,CLKG,C,L,K,G,0.0,0.932034,True,V183C:F184L:V227K:S228G,0
419,VLCS,V,L,C,S,0.0,0.752895,True,F184L:V227C,0
5,AICS,A,I,C,S,0.0,0.746142,True,V183A:F184I:V227C,0
299,NLLS,N,L,L,S,0.0,0.739434,True,V183N:F184L:V227L,0
...,...,...,...,...,...,...,...,...,...,...
149,HFDA,H,F,D,A,0.0,0.056115,True,V183H:V227D:S228A,0
33,DAHH,D,A,H,H,0.0,0.056108,True,V183D:F184A:V227H:S228H,0
97,FENC,F,E,N,C,0.0,0.055796,True,V183F:F184E:V227N:S228C,0
87,FAWL,F,A,W,L,0.0,0.055441,True,V183F:F184A:V227W:S228L,0


In [7]:
# Get the active variants and the args for multiprocessing
active_variants = df[df["active"]]["AAs"].values
pool_args = [(x, df, "fitness") for x in active_variants]

# For every active variant determine its rank among its single mutants
with Pool(16) as pool:
    results = pool.starmap(determine_optima, tqdm(pool_args))

find_optima_dict = {active_variants[i]: results[i] for i in range(len(active_variants))}

# Convert this data to a dataframe and merge it with the TrpB_data to get the fitness information
temp = pd.merge(
    df.copy(),
    pd.DataFrame(find_optima_dict, index=["n_greater"])
    .T.sort_values("n_greater", ascending=False)
    .reset_index()
    .rename(columns={"index": "AAs"}),
)

# Get the local optima by finding the variants where no single mutant is more fit
local_optima = temp[temp["n_greater"] == 0].reset_index(drop=True).copy()
local_optima.sort_values("fitness", ascending=False)

  0%|          | 0/34545 [00:00<?, ?it/s]

Unnamed: 0,AAs,fitness,active,muts,n_greater
95,FWAA,1.000000,True,V39F:D40W:G41A:V54A,0
35,ANCA,0.862463,True,V39A:D40N:G41C:V54A,0
97,FWLG,0.834591,True,V39F:D40W:G41L:V54G,0
20,VAAA,0.707733,True,D40A:G41A:V54A,0
10,IWGF,0.658760,True,V39I:D40W:V54F,0
...,...,...,...,...,...
102,GPIE,0.001401,True,V39G:D40P:G41I:V54E,0
129,NQMR,0.001325,True,V39N:D40Q:G41M:V54R,0
93,FPQQ,0.001303,True,V39F:D40P:G41Q:V54Q,0
65,DLAY,0.001197,True,V39D:D40L:G41A:V54Y,0


In [8]:
parent_fit = df[df['AAs'] == 'VDGV']['fitness'].values[0]

# add a column for rank
temp = local_optima.copy()
temp['rank'] = temp['fitness'].rank(ascending=False)

fig = hv.Scatter(
    temp,
    kdims=['rank'],
    vdims=['fitness']
).opts(
    xlabel='Rank',
    ylabel='Fitness',
    title='Local optima fitness'
)*hv.HLine(parent_fit).opts(color='grey')

fig

In [22]:
parent_fit = df[df['AAs'] == 'VFVS']['fitness'].values[0]

# add a column for rank
temp = local_optima.copy()
temp['rank'] = temp['fitness'].rank(ascending=False)

fig = hv.Scatter(
    temp,
    kdims=['rank'],
    vdims=['fitness']
).opts(
    xlabel='Rank',
    ylabel='Fitness',
    title='Local optima fitness'
)*hv.HLine(parent_fit).opts(color='grey')

fig

## Top twenty local optima

In [10]:
local_optima[local_optima['fitness'] > 0.1].sort_values('fitness', ascending=False).head(20)

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,fitness,active,imputed,n_greater
7,AIKG,A,I,K,G,1.0,True,False,0
25,CLKG,C,L,K,G,0.932034,True,False,0
415,VLCS,V,L,C,S,0.752895,True,False,0
5,AICS,A,I,C,S,0.746142,True,False,0
295,NLLS,N,L,L,S,0.739434,True,False,0
390,SLVS,S,L,V,S,0.703612,True,False,0
259,LPKG,L,P,K,G,0.699687,True,False,0
128,GCLS,G,C,L,S,0.676871,True,False,0
387,SIVN,S,I,V,N,0.67254,True,False,0
11,AVVN,A,V,V,N,0.644781,True,False,0


In [11]:
top_twenty = local_optima.sort_values('fitness', ascending=False)[:20].copy()
print(top_twenty['fitness'].describe())

(hv.Distribution(
    top_twenty,
    'fitness',
)*hv.VLine(top_twenty['fitness'].mean(), label='mean')*hv.VLine(top_twenty['fitness'].median(), label='median'))+\
hv.Histogram(
    np.histogram(top_twenty['fitness'], bins=20),
).opts(xlim=(0,1))

count    20.000000
mean      0.637802
std       0.154375
min       0.385816
25%       0.543032
50%       0.622284
75%       0.712568
max       1.000000
Name: fitness, dtype: float64


### Save for use in other analyses

In [8]:
# file_date = datetime.datetime.now().strftime('%Y%m%d')
# local_optima.to_csv('../../../data/figure_data/4-site_outputs/' + f'{file_date}_local_optima.csv')

In [24]:
local_optima.sort_values("fitness", ascending=False).reset_index(drop=True)

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts,n_greater
0,AIKG,A,I,K,G,0.0,1.000000,True,V183A:F184I:V227K:S228G,0
1,CLKG,C,L,K,G,0.0,0.932034,True,V183C:F184L:V227K:S228G,0
2,VLCS,V,L,C,S,0.0,0.752895,True,F184L:V227C,0
3,AICS,A,I,C,S,0.0,0.746142,True,V183A:F184I:V227C,0
4,NLLS,N,L,L,S,0.0,0.739434,True,V183N:F184L:V227L,0
...,...,...,...,...,...,...,...,...,...,...
520,HFDA,H,F,D,A,0.0,0.056115,True,V183H:V227D:S228A,0
521,DAHH,D,A,H,H,0.0,0.056108,True,V183D:F184A:V227H:S228H,0
522,FENC,F,E,N,C,0.0,0.055796,True,V183F:F184E:V227N:S228C,0
523,FAWL,F,A,W,L,0.0,0.055441,True,V183F:F184A:V227W:S228L,0


In [34]:
local_optima.n_greater.value_counts()

0    525
Name: n_greater, dtype: int64

In [12]:
print(f'fraction of active variants that are local optima: {len(local_optima) / len(TrpB_data[TrpB_data["active"]]) * 100:.3f}%')

print(f'fraction of total variants that are local optima: {len(local_optima) / len(TrpB_data) * 100:.3f}%')

fraction of active variants that are local optima: 5.315%
fraction of total variants that are local optima: 0.325%


In [9]:
print(f'fraction of active variants that are local optima: {len(local_optima) / len(df[df["active"]]) * 100:.3f}%')

print(f'fraction of total variants that are local optima: {len(local_optima) / len(df) * 100:.3f}%')

fraction of active variants that are local optima: 0.524%
fraction of total variants that are local optima: 0.121%


### How many of these optima can be escaped with double-site-saturation mutagenesis?

In [15]:
find_HD2_escape = {}

# loop through the optima
for opt_variant in tqdm(local_optima["AAs"].values):

    # slice out the variant and all the variants at hamming distance 2. Then sort the dataframe by descending fitness
    temp = (
        df[
            (df["AAs"].apply(lambda x: hamming(x, opt_variant) == 2))
            | (df["AAs"] == opt_variant)
        ]
        .sort_values("fitness", ascending=False)
        .reset_index(drop=True)
        .copy()
    )

    # determine the rank of the active variant/how many variants are more active than it. If the variant is not 0 it can escape
    find_HD2_escape[opt_variant] = temp[temp["AAs"] == opt_variant].index[0]

# merge the data with the TrpB_data to get the fitness information
temp = pd.merge(
    df.copy(),
    pd.DataFrame(find_HD2_escape, index=["n_greater"])
    .T.sort_values("n_greater", ascending=False)
    .reset_index()
    .rename(columns={"index": "AAs"}),
)

# Find the variants that can escape
HD2_can_escape = (
    temp[temp["n_greater"] != 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

# find the variants that cannot escape
HD2_cannot_escape = (
    temp[temp["n_greater"] == 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

# Print the results!
print(
    "fraction of active variants that are local optima: {:.3f}% (n={})".format(
        len(local_optima) / len(df[df["active"]]) * 100, len(local_optima)
    )
)

print(
    "fraction of total variants that are local optima: {:.3f}% (n={})".format(
        len(local_optima) / len(df) * 100, len(local_optima)
    )
)

print(
    "fraction of local optima that can be escaped with double-site saturation mutagenesis: {:.2f}% (n={}))".format(
        len(HD2_can_escape) / len(local_optima) * 100, len(HD2_can_escape)
    )
)

print(
    "fraction of local optima still cannot be escaped with double-site saturation mutagenesis: {:.2f}% (n={})".format(
        len(HD2_cannot_escape) / len(local_optima) * 100, len(HD2_cannot_escape)
    )
)

  0%|          | 0/525 [00:00<?, ?it/s]

fraction of active variants that are local optima: 5.366% (n=525)
fraction of total variants that are local optima: 0.330% (n=525)
fraction of local optima that can be escaped with double-site saturation mutagenesis: 97.90% (n=514))
fraction of local optima still cannot be escaped with double-site saturation mutagenesis: 2.10% (n=11)


In [10]:
find_HD2_escape = {}

# loop through the optima
for opt_variant in tqdm(local_optima["AAs"].values):

    # slice out the variant and all the variants at hamming distance 2. Then sort the dataframe by descending fitness
    temp = (
        df[
            (df["AAs"].apply(lambda x: hamming(x, opt_variant) == 2))
            | (df["AAs"] == opt_variant)
        ]
        .sort_values("fitness", ascending=False)
        .reset_index(drop=True)
        .copy()
    )

    # determine the rank of the active variant/how many variants are more active than it. If the variant is not 0 it can escape
    find_HD2_escape[opt_variant] = temp[temp["AAs"] == opt_variant].index[0]

# merge the data with the TrpB_data to get the fitness information
temp = pd.merge(
    df.copy(),
    pd.DataFrame(find_HD2_escape, index=["n_greater"])
    .T.sort_values("n_greater", ascending=False)
    .reset_index()
    .rename(columns={"index": "AAs"}),
)

# Find the variants that can escape
HD2_can_escape = (
    temp[temp["n_greater"] != 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

# find the variants that cannot escape
HD2_cannot_escape = (
    temp[temp["n_greater"] == 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

# Print the results!
print(
    "fraction of active variants that are local optima: {:.3f}% (n={})".format(
        len(local_optima) / len(df[df["active"]]) * 100, len(local_optima)
    )
)

print(
    "fraction of total variants that are local optima: {:.3f}% (n={})".format(
        len(local_optima) / len(df) * 100, len(local_optima)
    )
)

print(
    "fraction of local optima that can be escaped with double-site saturation mutagenesis: {:.2f}% (n={}))".format(
        len(HD2_can_escape) / len(local_optima) * 100, len(HD2_can_escape)
    )
)

print(
    "fraction of local optima still cannot be escaped with double-site saturation mutagenesis: {:.2f}% (n={})".format(
        len(HD2_cannot_escape) / len(local_optima) * 100, len(HD2_cannot_escape)
    )
)

  0%|          | 0/181 [00:00<?, ?it/s]

fraction of active variants that are local optima: 0.524% (n=181)
fraction of total variants that are local optima: 0.121% (n=181)
fraction of local optima that can be escaped with double-site saturation mutagenesis: 98.90% (n=179))
fraction of local optima still cannot be escaped with double-site saturation mutagenesis: 1.10% (n=2)


In [25]:
HD2_can_escape

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts,n_greater
24,CLKG,C,L,K,G,0.0,0.932034,True,V183C:F184L:V227K:S228G,1
411,VLCS,V,L,C,S,0.0,0.752895,True,F184L:V227C,3
5,AICS,A,I,C,S,0.0,0.746142,True,V183A:F184I:V227C,3
294,NLLS,N,L,L,S,0.0,0.739434,True,V183N:F184L:V227L,3
386,SLVS,S,L,V,S,0.0,0.703612,True,V183S:F184L,7
...,...,...,...,...,...,...,...,...,...,...
146,HFDA,H,F,D,A,0.0,0.056115,True,V183H:V227D:S228A,127
32,DAHH,D,A,H,H,0.0,0.056108,True,V183D:F184A:V227H:S228H,58
96,FENC,F,E,N,C,0.0,0.055796,True,V183F:F184E:V227N:S228C,57
86,FAWL,F,A,W,L,0.0,0.055441,True,V183F:F184A:V227W:S228L,32


In [26]:
HD2_cannot_escape

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts,n_greater
0,AIKG,A,I,K,G,0.0,1.0,True,V183A:F184I:V227K:S228G,0
3,HKHG,H,K,H,G,0.0,0.318798,True,V183H:F184K:V227H:S228G,0
7,RQYV,R,Q,Y,V,0.0,0.290775,True,V183R:F184Q:V227Y:S228V,0
5,NYYW,N,Y,Y,W,0.0,0.227946,True,V183N:F184Y:V227Y:S228W,0
6,PYDD,P,Y,D,D,0.0,0.200796,True,V183P:F184Y:V227D:S228D,0
4,KRNW,K,R,N,W,0.0,0.199166,True,V183K:F184R:V227N:S228W,0
2,HDFE,H,D,F,E,0.0,0.189496,True,V183H:F184D:V227F:S228E,0
1,FWFI,F,W,F,I,0.0,0.151151,True,V183F:F184W:V227F:S228I,0
10,YHWN,Y,H,W,N,0.0,0.143094,True,V183Y:F184H:V227W:S228N,0
9,WTWT,W,T,W,T,0.0,0.129934,True,V183W:F184T:V227W:S228T,0


In [14]:
find_HD3_escape = {}

# Loop through the optima that could not escae with double-site saturation mutagenesis
for opt_variant in tqdm(HD2_cannot_escape['AAs'].values):
    
    # slice out the variant and all the variants at hamming distance 3. Then sort the dataframe by descending fitness
    temp = TrpB_data[(TrpB_data['AAs'].apply(lambda x: hamming(x, opt_variant) == 3)) | (TrpB_data['AAs'] == opt_variant)].sort_values('fitness', ascending=False).reset_index(drop=True).copy()

    # determine the rank of the active variant/how many variants are more active than it. If the variant is not 0 it can escape
    find_HD3_escape[opt_variant] = temp[temp['AAs'] == opt_variant].index[0]

# merge the data with the TrpB_data to get the fitness information
temp = pd.merge(
    TrpB_data.copy(),
    pd.DataFrame(find_HD3_escape, index=['n_greater']).T.sort_values('n_greater', ascending=False).reset_index().rename(columns={'index':'AAs'})
)

# Find the variants that can escape
HD3_can_escape = temp[temp['n_greater'] != 0].reset_index(drop=True).sort_values('fitness', ascending=False)

# find the variants that cannot escape
HD3_cannot_escape = temp[temp['n_greater'] == 0].reset_index(drop=True).sort_values('fitness', ascending=False)

# Print a bunch of information about the escape data
print(f'fraction of active variants that are local optima: {len(local_optima) / len(TrpB_data[TrpB_data["active"]]) * 100:.3f}% (n={len(local_optima)})')

print(f'fraction of total variants that are local optima: {len(local_optima) / len(TrpB_data) * 100:.3f}% (n={len(local_optima)})')

print(f'fraction of local optima that can be escaped with double-site saturation mutagenesis: {len(HD2_can_escape) / len(local_optima) * 100:.2f}% (n={len(HD2_can_escape)})')

print(f'fraction of local optima that still cannot be escaped with double-site saturation mutagenesis: {len(HD2_cannot_escape) / len(local_optima) * 100:.2f}% (n={len(HD2_cannot_escape)})')

print(f'fraction of optima which could not be escaped by single- or double-site saturation mutagenesis that can be escaped with triple-site saturation mutagenesis: {len(HD3_can_escape) / len(HD2_cannot_escape) * 100:.2f}% (n={len(HD3_can_escape)})')

print('\nThis neglects that AIKG is included, which can never be "escaped". If we remove AIKG, then:')

print(f'fraction of active variants that are local optima: {(len(local_optima)-1) / len(TrpB_data[TrpB_data["active"]]) * 100:.3f}% (n={len(local_optima)-1})')

print(f'fraction of total variants that are local optima: {(len(local_optima)-1) / len(TrpB_data) * 100:.3f}% (n={len(local_optima)-1})')

print(f'fraction of local optima that can be escaped with double-site saturation mutagenesis: {len(HD2_can_escape) / (len(local_optima)-1) * 100:.2f}% (n={len(HD2_can_escape)})')

print(f'fraction of local optima that still cannot be escaped with double-site saturation mutagenesis: {(len(HD2_cannot_escape)-1) / (len(local_optima)-1) * 100:.2f}% (n={len(HD2_cannot_escape)-1})')

print(f'fraction of optima which could not be escaped by single- or double-site saturation mutagenesis that can be escaped with triple-site saturation mutagenesis: {len(HD3_can_escape) / (len(HD2_cannot_escape)-1) * 100:.2f}% (n={len(HD3_can_escape)})')

  0%|          | 0/10 [00:00<?, ?it/s]

fraction of active variants that are local optima: 5.315% (n=520)
fraction of total variants that are local optima: 0.325% (n=520)
fraction of local optima that can be escaped with double-site saturation mutagenesis: 98.08% (n=510)
fraction of local optima that still cannot be escaped with double-site saturation mutagenesis: 1.92% (n=10)
fraction of optima which could not be escaped by single- or double-site saturation mutagenesis that can be escaped with triple-site saturation mutagenesis: 90.00% (n=9)

This neglects that AIKG is included, which can never be "escaped". If we remove AIKG, then:
fraction of active variants that are local optima: 5.305% (n=519)
fraction of total variants that are local optima: 0.324% (n=519)
fraction of local optima that can be escaped with double-site saturation mutagenesis: 98.27% (n=510)
fraction of local optima that still cannot be escaped with double-site saturation mutagenesis: 1.73% (n=9)
fraction of optima which could not be escaped by single- or

## Great, but we might not pick the right two pairs of sites for double-site saturation and we might not randomly sample it either.

Calculate the fraction of the (six) possible pairs of positions that could be chosen for double-site saturation mutagenesis.

In [27]:
sequences = df['AAs'].values
fitnesses = df['fitness'].values

data_dict = {seq:fit for seq,fit in zip(sequences,fitnesses)}

In [31]:
find_HD2_escape = {}
double_site_escape = {}

# Loop through the optima
for opt_variant in tqdm(local_optima["AAs"].values):

    # slice out the variant and all the variants at hamming distance 2. Then sort the dataframe by descending fitness
    temp = (
        df[
            (df["AAs"].apply(lambda x: hamming(x, opt_variant) == 2))
            | (df["AAs"] == opt_variant)
        ]
        .sort_values("fitness", ascending=False)
        .reset_index(drop=True)
        .copy()
    )

    # determine the rank of the active variant/how many variants are more active than it. If the variant is not 0 it can escape
    find_HD2_escape[opt_variant] = temp[temp["AAs"] == opt_variant].index[0]
    
    # This time save the entire DataFrame for some downstream analyses
    double_site_escape[opt_variant] = temp

# merge the data with the TrpB_data to get the fitness information
HD2_escape_df = pd.merge(
    df.copy(),
    pd.DataFrame(find_HD2_escape, index=["n_escape"])
    .T.sort_values("n_escape", ascending=False)
    .reset_index()
    .rename(columns={"index": "AAs"}),
)

# Find the variants that can escape
HD2_can_escape = (
    HD2_escape_df[HD2_escape_df["n_escape"] != 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

# find the variants that cannot escape
HD2_cannot_escape = (
    HD2_escape_df[HD2_escape_df["n_escape"] == 0]
    .reset_index(drop=True)
    .sort_values("fitness", ascending=False)
)

print(
    "fraction of local optima that can be escaped with double-site saturation mutagenesis: {:.2f}% (n={}))".format(
        len(HD2_can_escape) / len(local_optima) * 100, len(HD2_can_escape)
    )
)

print(
    "fraction of local optima still cannot be escaped with double-site saturation mutagenesis: {:.2f}% (n={})".format(
        len(HD2_cannot_escape) / len(local_optima) * 100, len(HD2_cannot_escape)
    )
)
    
    
# Get all possible pairs of positions that could be included in the escape double
position_sets = list(itertools.combinations(range(4), 2))

result_dict = {}

# for every local optima
for var_of_interest in tqdm(local_optima["AAs"].values):

    # Get the dataframe for that variant
    result_dict[var_of_interest] = {}
    var_fit = data_dict[var_of_interest]
    temp = double_site_escape[var_of_interest].copy()

    # For every pair of positions
    for position1, position2 in position_sets:

        # find the mutants that escape the double-site mutant
        _temp = temp[temp["fitness"] > var_fit].reset_index(drop=True)
        _temp = _temp[
            (_temp[f"AA{position1+1}"] == var_of_interest[position1])
            & (_temp[f"AA{position2+1}"] == var_of_interest[position2])
        ]

        # save the number of mutants for that pair that escape
        result_dict[var_of_interest][(position1, position2)] = len(_temp)

# Convert these results to a DataFrame
local_optima_escape = pd.DataFrame(result_dict).T
local_optima_escape.columns = [
    "".join(str(col)).strip() for col in local_optima_escape.columns.values
]

# Add some columns to the dataframe for the fraction that do and do not escape
local_optima_escape["frac pairs no escape"] = local_optima_escape.apply(
    lambda x: list(x).count(0) / 6, axis=1
)
local_optima_escape["frac pairs that escape"] = local_optima_escape.apply(
    lambda x: 1 - list(x).count(0) / 6, axis=1
)

# Add the fitness of the local optima to the dataframe
local_optima_escape = local_optima_escape.reset_index().rename(columns={"index": "AAs"})
local_optima_escape["fitness"] = local_optima_escape["AAs"].apply(
    lambda x: data_dict[x]
)

local_optima_escape

  0%|          | 0/525 [00:00<?, ?it/s]

fraction of local optima that can be escaped with double-site saturation mutagenesis: 97.90% (n=514))
fraction of local optima still cannot be escaped with double-site saturation mutagenesis: 2.10% (n=11)


  0%|          | 0/525 [00:00<?, ?it/s]

Unnamed: 0,AAs,"(0, 1)","(0, 2)","(0, 3)","(1, 2)","(1, 3)","(2, 3)",frac pairs no escape,frac pairs that escape,fitness
0,ADIG,0,4,31,0,0,27,0.500000,0.500000,0.520769
1,ADQQ,10,12,0,0,1,0,0.500000,0.500000,0.220680
2,AFMS,0,3,17,0,0,2,0.500000,0.500000,0.588559
3,AHFW,17,9,0,2,5,0,0.333333,0.666667,0.085170
4,AHKI,7,19,0,0,0,0,0.666667,0.333333,0.167142
...,...,...,...,...,...,...,...,...,...,...
520,YYER,10,13,0,7,1,7,0.166667,0.833333,0.059044
521,YYLW,8,2,10,19,7,1,0.000000,0.833333,0.061844
522,YYRM,0,1,2,0,3,0,0.500000,0.500000,0.089476
523,YYWH,3,3,2,9,1,5,0.000000,0.833333,0.070629


In [32]:
HD2_escape_df

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness,active,muts,n_escape
0,ADIG,A,D,I,G,0.0,0.520769,True,V183A:F184D:V227I:S228G,62
1,ADQQ,A,D,Q,Q,0.0,0.220680,True,V183A:F184D:V227Q:S228Q,23
2,AFMS,A,F,M,S,0.0,0.588559,True,V183A:V227M,22
3,AHFW,A,H,F,W,0.0,0.085170,True,V183A:F184H:V227F:S228W,33
4,AHKI,A,H,K,I,0.0,0.167142,True,V183A:F184H:V227K:S228I,26
...,...,...,...,...,...,...,...,...,...,...
520,YYER,Y,Y,E,R,0.0,0.059044,True,V183Y:F184Y:V227E:S228R,38
521,YYLW,Y,Y,L,W,0.0,0.061844,True,V183Y:F184Y:V227L:S228W,47
522,YYRM,Y,Y,R,M,0.0,0.089476,True,V183Y:F184Y:V227R:S228M,6
523,YYWH,Y,Y,W,H,0.0,0.070629,True,V183Y:F184Y:V227W:S228H,23


In [33]:
merged_df = pd.merge(HD2_escape_df, local_optima_escape, on='AAs')
merged_df


Unnamed: 0,AAs,AA1,AA2,AA3,AA4,# Stop,fitness_x,active,muts,n_escape,"(0, 1)","(0, 2)","(0, 3)","(1, 2)","(1, 3)","(2, 3)",frac pairs no escape,frac pairs that escape,fitness_y
0,ADIG,A,D,I,G,0.0,0.520769,True,V183A:F184D:V227I:S228G,62,0,4,31,0,0,27,0.500000,0.500000,0.520769
1,ADQQ,A,D,Q,Q,0.0,0.220680,True,V183A:F184D:V227Q:S228Q,23,10,12,0,0,1,0,0.500000,0.500000,0.220680
2,AFMS,A,F,M,S,0.0,0.588559,True,V183A:V227M,22,0,3,17,0,0,2,0.500000,0.500000,0.588559
3,AHFW,A,H,F,W,0.0,0.085170,True,V183A:F184H:V227F:S228W,33,17,9,0,2,5,0,0.333333,0.666667,0.085170
4,AHKI,A,H,K,I,0.0,0.167142,True,V183A:F184H:V227K:S228I,26,7,19,0,0,0,0,0.666667,0.333333,0.167142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,YYER,Y,Y,E,R,0.0,0.059044,True,V183Y:F184Y:V227E:S228R,38,10,13,0,7,1,7,0.166667,0.833333,0.059044
521,YYLW,Y,Y,L,W,0.0,0.061844,True,V183Y:F184Y:V227L:S228W,47,8,2,10,19,7,1,0.000000,0.833333,0.061844
522,YYRM,Y,Y,R,M,0.0,0.089476,True,V183Y:F184Y:V227R:S228M,6,0,1,2,0,3,0,0.500000,0.500000,0.089476
523,YYWH,Y,Y,W,H,0.0,0.070629,True,V183Y:F184Y:V227W:S228H,23,3,3,2,9,1,5,0.000000,0.833333,0.070629


In [29]:
list(itertools.combinations(range(4), 2))


[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

In [30]:
list(itertools.combinations(range(3), 2))


[(0, 1), (0, 2), (1, 2)]

In [22]:
# Histogram of fraction of pairs that escape
hv.Histogram(np.histogram(local_optima_escape['frac pairs that escape'].values, bins=6)).opts(
    ylabel='Number of variants',
    xlabel='Fraction of pairs that escape',
)

## GB1

In [18]:
GB1_active_df = GB1_data[GB1_data['Fitness'] > 0.01].reset_index(drop=True)
active_variants = GB1_active_df['AAs'].values
pool_args = [(x, GB1_data, 'Fitness/max') for x in active_variants]

with Pool(16) as pool:
    results = pool.starmap(determine_optima, tqdm(pool_args))

find_GB1_optima_dict = {active_variants[i]: results[i] for i in range(len(active_variants))}

temp = pd.merge(
    GB1_data.copy(),
    pd.DataFrame(find_GB1_optima_dict, index=['n_greater']).T.sort_values('n_greater', ascending=False).reset_index().rename(columns={'index':'AAs'})
)

GB1_local_optima = temp[temp['n_greater']==0].reset_index(drop=True).copy()
GB1_local_optima.sort_values('Fitness/max', ascending=False)

  0%|          | 0/43582 [00:00<?, ?it/s]

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,Fitness,imputed,Fitness/max,active,n_greater
0,AHCA,A,H,C,A,9.913088,True,1.0,False,0
11,FWAA,F,W,A,A,8.761966,False,0.883879,True,0
13,FWLG,F,W,L,G,7.312656,False,0.737677,True,0
25,VAAA,V,A,A,A,6.201135,False,0.62555,True,0
16,IWGF,I,W,G,F,5.772032,False,0.582264,True,0
2,ANLG,A,N,L,G,5.5913,False,0.564032,True,0
14,FYGN,F,Y,G,N,5.494876,False,0.554305,True,0
12,FWGS,F,W,G,S,5.380293,False,0.542746,True,0
24,TYGM,T,Y,G,M,5.242722,False,0.528869,True,0
19,LYGV,L,Y,G,V,5.075299,False,0.51198,True,0


### I confirm that I found the same local optima that the authors found

In [19]:
GB1_my_locals = set(GB1_local_optima[GB1_local_optima['Fitness'] > 1].reset_index(drop=True)['AAs'].values)
print(GB1_my_locals)

GB1_paper_locals = set(['PQKD','VAAA','FWAA','FWLG','ANLG','IWGF','IGQV','FWGS','FYGN','WNWY','LYGV','TYGM','WYGW','IYGC','AHCA'])
print(GB1_paper_locals)

print(GB1_my_locals-GB1_paper_locals)
print(GB1_paper_locals-GB1_my_locals)

{'TYGM', 'WYGW', 'IWGF', 'LYGV', 'IYGC', 'IGQV', 'FWAA', 'FWLG', 'AHCA', 'ANLG', 'FYGN', 'PQKD', 'WNWY', 'VAAA', 'FWGS'}
{'TYGM', 'WYGW', 'IWGF', 'LYGV', 'IYGC', 'IGQV', 'FWAA', 'FWLG', 'AHCA', 'ANLG', 'FYGN', 'PQKD', 'WNWY', 'VAAA', 'FWGS'}
set()
set()


In [20]:
top_twenty_GB1 = GB1_local_optima.sort_values('Fitness/max', ascending=False)[:20].copy()
print(top_twenty_GB1['Fitness/max'].describe())

(hv.Distribution(
    top_twenty_GB1,
    'Fitness/max',
)*hv.VLine(top_twenty_GB1['Fitness/max'].mean(), label='mean')*hv.VLine(top_twenty_GB1['Fitness/max'].median(), label='median'))+\
hv.Histogram(
    np.histogram(top_twenty_GB1['Fitness/max'], bins=20),
)

count    20.000000
mean      0.407619
std       0.301055
min       0.009216
25%       0.122454
50%       0.482244
75%       0.568590
max       1.000000
Name: Fitness/max, dtype: float64


### How many local optima are better than parent for the TrpB landscape?
19

In [21]:
# TrpB local optima (greater than parent), 19 greater than parent including AIKG
parent_fitness = TrpB_data[TrpB_data['AAs'] == 'VFVS']['fitness'].values[0]
local_optima[local_optima['fitness'] >= parent_fitness].sort_values('fitness', ascending=False).reset_index(drop=True)

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,fitness,active,imputed,n_greater
0,AIKG,A,I,K,G,1.0,True,False,0
1,CLKG,C,L,K,G,0.932034,True,False,0
2,VLCS,V,L,C,S,0.752895,True,False,0
3,AICS,A,I,C,S,0.746142,True,False,0
4,NLLS,N,L,L,S,0.739434,True,False,0
5,SLVS,S,L,V,S,0.703612,True,False,0
6,LPKG,L,P,K,G,0.699687,True,False,0
7,GCLS,G,C,L,S,0.676871,True,False,0
8,SIVN,S,I,V,N,0.67254,True,False,0
9,AVVN,A,V,V,N,0.644781,True,False,0


### Save notebook as HTML

In [23]:
# os.system('jupyter nbconvert --to html local_optima.ipynb')

[NbConvertApp] Converting notebook local_optima.ipynb to html
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
  {%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] Writing 515507 bytes to local_optima.html


0