In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [3]:
import pandas as pd

In [4]:

import holoviews as hv
from holoviews import dim


hv.extension("bokeh")

from SSMuLA.landscape_global import LIB_NAMES, TrpB_names
from SSMuLA.vis import (
    save_bokeh_hv,
    JSON_THEME,
    LIB_COLORS,
    one_decimal_x,
    one_decimal_y,
    fixmargins,
)
from SSMuLA.util import get_file_name, checkNgen_folder

hv.renderer("bokeh").theme = JSON_THEME


In [5]:
df = pd.read_csv("results/pairwise_epistasis_vis/none/scale2max.csv")
df

Unnamed: 0,lib,n_mut,pos_calc_filter_min,summary_type,epistasis_type,value
0,DHFR,all,none,count,magnitude,538000.000000
1,DHFR,single,none,count,magnitude,24575.000000
2,DHFR,double,none,count,magnitude,285763.000000
3,DHFR,all,none,count,sign,369049.000000
4,DHFR,single,none,count,sign,17743.000000
...,...,...,...,...,...,...
211,TrpB4,single,none,fraction,sign,0.381144
212,TrpB4,double,none,fraction,sign,0.365450
213,TrpB4,all,none,fraction,reciprocal sign,0.092746
214,TrpB4,single,none,fraction,reciprocal sign,0.079531


In [6]:
df[df["summary_type"] == "fraction"]

Unnamed: 0,lib,n_mut,pos_calc_filter_min,summary_type,epistasis_type,value
9,DHFR,all,none,fraction,magnitude,0.581696
10,DHFR,single,none,fraction,magnitude,0.567290
11,DHFR,double,none,fraction,magnitude,0.579917
12,DHFR,all,none,fraction,sign,0.399023
13,DHFR,single,none,fraction,sign,0.409580
...,...,...,...,...,...,...
211,TrpB4,single,none,fraction,sign,0.381144
212,TrpB4,double,none,fraction,sign,0.365450
213,TrpB4,all,none,fraction,reciprocal sign,0.092746
214,TrpB4,single,none,fraction,reciprocal sign,0.079531


In [7]:
from SSMuLA.pairwise_epistasis import EPISTASIS_TYPE

In [None]:
def hook(plot,element):
    plot.handles['plot'].x_range.factors = [(lib, epistasis) for lib in LIB_NAMES for epistasis in EPISTASIS_TYPE]

# Create the Holoviews Bars element
bars = hv.Bars(df[df["summary_type"] == "fraction"], 
               kdims=["lib", "epistasis_type"], 
               vdims="value"
               ).opts(
    width=1200,
    height=400,
    show_legend=True,
    legend_position="top",
    legend_offset=(0, 5),
    ylabel="Fraction",
    multi_level=False,
    title="Fraction of pairwise epistasis types",
    xlabel="Library",
    hooks=[fixmargins, one_decimal_y, hook],
    # x_range_factor = [
    #         (lib, epistasis)
    #         for lib in LIB_NAMES
    #         for epistasis in EPISTASIS_TYPE
    #     ]
        )
bars

In [None]:
import os
from glob import glob

In [None]:
# results/pairwise_epistasis_vis/none/scale2max.csv
# make bar plots base on that and save to the same directory
def hook(plot, element):
    plot.handles["plot"].x_range.factors = [
        (lib, epistasis) for lib in LIB_NAMES for epistasis in EPISTASIS_TYPE
    ]

# Create the Holoviews Bars element
save_bokeh_hv(
    hv.Bars(
        df[df["summary_type"] == "fraction"],
        kdims=["lib", "epistasis_type"],
        vdims="value",
    ).opts(
        width=1200,
        height=400,
        show_legend=True,
        legend_position="top",
        legend_offset=(0, 5),
        ylabel="Fraction",
        multi_level=False,
        title="Fraction of pairwise epistasis types",
        xlabel="Library",
        hooks=[fixmargins, one_decimal_y, hook],
    ),
    plot_name="scale2max",
    plot_path=os.path.join("results/pairwise_epistasis_vis", "none"),
)


In [None]:
import ast

In [None]:
# try zs summary
zs_sum_df = pd.read_csv("results/zs_sum/none/zs_stat_scale2max.csv")
# make the nested dict in zs_sum_df to be flat as columns and make this a melted df
zs_sum_df_melt = zs_sum_df.melt(
    id_vars=["lib", "n_mut"],
    value_vars=["Triad_score", "ev_score", "esm_score"],
    var_name="zs_type",
    value_name="corr",
)

# Concatenate the expanded columns back to the original DataFrame
df_expanded = pd.concat(
    [
        zs_sum_df_melt.drop("corr", axis=1),
        zs_sum_df_melt["corr"].apply(ast.literal_eval).apply(pd.Series),
    ],
    axis=1,
)
df_expanded

In [None]:
df_score = df_expanded.melt(id_vars=["lib", "n_mut", "zs_type"], value_vars=["rho", "ndcg", "rocauc"], var_name="metric", value_name="value")
df_score

In [None]:
import holoviews as hv
hv.extension('bokeh')

In [None]:
from SSMuLA.zs_analysis import ZS_OPTS

In [None]:
def hook(plot,element):
    plot.handles['plot'].x_range.factors = [(lib, zs) for lib in LIB_NAMES for zs in ZS_OPTS]

In [None]:
for metric in ["rho", "ndcg", "rocauc"]:

    # Create the Holoviews Bars element
    save_bokeh_hv(
        hv.Bars(df_score[df_score["metric"] == metric], 
                kdims=["lib", "zs_type"], 
                vdims="value"
                ).opts(
        width=1200,
        height=400,
        show_legend=True,
        legend_position="top",
        legend_offset=(0, 5),
        ylabel=f"{metric} correlation",
        multi_level=False,
        title=f"ZS fitness {metric} correlation",
        xlabel="Library",
        hooks=[fixmargins, one_decimal_y, hook],
            ),
        plot_name=f"zs_stat_scale2max-{metric}",
        plot_path=os.path.join("results/zs_sum", "none"),
    )


In [None]:
def noesmhook(plot,element):
    plot.handles['plot'].x_range.factors = [(lib, zs) for lib in LIB_NAMES for zs in ["Triad_score", "ev_score"]]

for metric in ["rho", "ndcg", "rocauc"]:

    # Create the Holoviews Bars element
    save_bokeh_hv(
        hv.Bars(df_score[(df_score["metric"] == metric) & (df_score["zs_type"] != "esm_score")], 
                kdims=["lib", "zs_type"], 
                vdims="value"
                ).opts(
        width=1200,
        height=400,
        show_legend=True,
        legend_position="top",
        legend_offset=(0, 5),
        ylabel=f"{metric} correlation",
        multi_level=False,
        title=f"ZS fitness {metric} correlation",
        xlabel="Library",
        hooks=[fixmargins, one_decimal_y, noesmhook],
            ),
        plot_name=f"zs_stat_scale2max-{metric}-noesm",
        plot_path=os.path.join("results/zs_sum", "none"),
    )


In [8]:
# try de sim sum
df = pd.read_csv("results/simulations/DE-active/scale2max/all_landscape_de_summary.csv")
df

Unnamed: 0,lib,de_type,mean_all,median_all,mean_top96,median_top96,mean_top384,median_top384,fraction_max
0,DHFR,single_step_DE,0.889922,0.857847,1.0,1.0,1.0,1.0,0.283568
1,DHFR,recomb_SSM,0.851463,0.847249,0.999315,1.0,0.949198,0.959943,0.090164
2,DHFR,top96_SSM,0.959305,1.0,1.0,1.0,1.0,1.0,0.632319
3,GB1,single_step_DE,0.571523,0.597319,1.0,1.0,1.0,1.0,0.026045
4,GB1,recomb_SSM,0.362927,0.37017,0.978695,1.0,0.887291,0.862211,0.002055
5,GB1,top96_SSM,0.611348,0.620935,1.0,1.0,1.0,1.0,0.02504
6,TrpB3A,single_step_DE,0.401312,0.200246,0.993309,1.0,,,0.254237
7,TrpB3A,recomb_SSM,0.401256,0.190336,,,,,0.220339
8,TrpB3A,top96_SSM,0.428916,0.230815,,,,,0.288136
9,TrpB3B,single_step_DE,0.271319,0.12797,0.294215,0.12797,,,0.166667


In [None]:
# Fill NaNs in 'mean_top96' and 'median_top96' from 'mean_all' and 'median_all'
df['mean_top96'] = df['mean_top96'].fillna(df['mean_all'])
df['median_top96'] = df['median_top96'].fillna(df['median_all'])

# Fill NaNs in 'mean_top384' and 'median_top384' from 'mean_top96' and 'median_top96'
df['mean_top384'] = df['mean_top384'].fillna(df['mean_top96'])
df['median_top384'] = df['median_top384'].fillna(df['median_top96'])

df

In [None]:
for lib in sorted(glob("data/TrpB/scale2max/TrpB3*.csv")):
    if "codon" not in lib:
        trpb_df = pd.read_csv(lib)
        if trpb_df.loc[trpb_df["fitness"].idxmax()]["fitness"] != 1:
            print(trpb_df.loc[trpb_df["fitness"].idxmax()])

In [None]:
# Find the index of the row with the max value in column 'C'
max_index = df['C'].idxmax()

# Retrieve the row with the max value in column 'C'
max_row = df.loc[max_index]

In [None]:
de_metric_map = {
    "mean_all": "all simulations fitness mean",
    "median_all": "all simulations fitness median",
    "mean_top96": "top 96 simulations fitness mean",
    "median_top96":	"top 96 simulations fitness median",
    "mean_top384": "top 384 simulations fitness mean",
    "median_top384": "top 384 simulations fitness median",
    "fraction_max": "fraction reached max fitness",
}

In [None]:
def de_hook(plot,element):
    plot.handles['plot'].x_range.factors = [(lib, de) for lib in LIB_NAMES for de in ["single_step_DE", "recomb_SSM", "top96_SSM"]]

for metric, metric_dets in de_metric_map.items():

    title = f"DE from active variant {metric_dets}"

    save_bokeh_hv(
        hv.Bars(df, kdims=["lib", "de_type"], vdims=metric).opts(
        width=1200,
        height=400,
        show_legend=True,
        legend_position="top",
        legend_offset=(0, 5),
        ylabel=metric_dets.capitalize(),
        multi_level=False,
        title=title,
        xlabel="Library",
        hooks=[fixmargins, one_decimal_y, de_hook],
    ),
        plot_name=title,
        plot_path=os.path.join("results/simulations/DE-active", "scale2max", "summary"),
    )

In [7]:
mlde_df = pd.read_csv("results/mlde/vis/all_df.csv")
mlde_df

Unnamed: 0,encoding,model,n_sample,ft_lib,rep,maxes_all,means_all,maxes,means,ndcgs,rhos,if_truemaxs,truemax_inds,n_mut_cutoff,lib,zs,n_top
0,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,0.840951,0.156936,1.000000,0.499483,0.930975,0.498705,1.0,267.0,all,DHFR,Triad_score,384
1,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,1,0.837710,0.159552,0.996537,0.500002,0.932816,0.480707,0.0,,all,DHFR,Triad_score,384
2,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,2,0.841871,0.158632,0.996537,0.464790,0.901309,0.464723,0.0,,all,DHFR,Triad_score,384
3,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,3,0.812338,0.180189,1.000000,0.474807,0.926957,0.420841,1.0,2.0,all,DHFR,Triad_score,384
4,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,4,0.841787,0.162283,1.000000,0.485674,0.923643,0.430931,1.0,274.0,all,DHFR,Triad_score,384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245995,one-hot,ridge,384,77,95,0.771698,0.017581,0.749107,0.330329,0.979702,0.190245,0.0,,single,TrpB4,none,96
245996,one-hot,ridge,384,77,96,0.597178,0.060315,0.749107,0.361424,0.980491,0.190741,0.0,,single,TrpB4,none,96
245997,one-hot,ridge,384,77,97,0.634966,0.009730,0.752895,0.428029,0.981345,0.193477,0.0,,single,TrpB4,none,96
245998,one-hot,ridge,384,77,98,0.783760,0.040222,0.749107,0.284615,0.979312,0.188739,0.0,,single,TrpB4,none,96


In [None]:
mlde_df_2 = pd.read_csv("results/mlde_old/vis/all_df.csv")
mlde_df_2

In [None]:
pd.set_option("display.max_rows", None)

sliced_df_no_zs_top96_all = mlde_df[
    (mlde_df["zs"] == "none")
    & (mlde_df["n_top"] == 96)
    & (mlde_df["n_mut_cutoff"] == "all")
][
    [
        "lib",
        "encoding",
        "model",
        "maxes_all",
        "means_all",
        "maxes",
        "means",
        "ndcgs",
        "rhos",
        "if_truemaxs",
        "truemax_inds",
    ]
].groupby(
    ["lib", "encoding", "model"]
).mean()

sliced_df_no_zs_top96_all

In [None]:
pd.reset_option('display.max_rows')


In [5]:

import bokeh
from bokeh.io import show, export_svg, export_png
from bokeh.plotting import show
from bokeh.themes.theme import Theme

bokeh.io.output_notebook()

import holoviews as hv
from holoviews import opts

hv.extension("bokeh", "matplotlib")


from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

In [None]:
sliced_df_no_zs_top96_all.reset_index()

In [None]:
sliced_df_no_zs_top96_all.reset_index()

In [8]:
slice_mlde_df = mlde_df[
        (mlde_df["zs"] == "none")
        & (mlde_df["n_top"] == 96)
        & (mlde_df["n_mut_cutoff"] == "all")
        & (mlde_df["encoding"].isin(["esm2_t33_650M_UR50D-flatten_site", "one-hot"]))
    ][["lib", "encoding", "model", "maxes", "rep"]]
slice_mlde_df["encoding"] = slice_mlde_df["encoding"].str.replace("esm2_t33_650M_UR50D-flatten_site", "esm")
slice_mlde_df = slice_mlde_df.reset_index(drop=True)
slice_mlde_df = slice_mlde_df.sort_values(by=["lib", "encoding", "model"])
slice_mlde_df.head()

Unnamed: 0,lib,encoding,model,maxes,rep
0,DHFR,esm,boosting,1.0,0
1,DHFR,esm,boosting,1.0,1
2,DHFR,esm,boosting,1.0,2
3,DHFR,esm,boosting,1.0,3
4,DHFR,esm,boosting,1.0,4


In [15]:
slice_mlde_df[(slice_mlde_df["lib"] == "TrpB3F") & (slice_mlde_df["encoding"] == "esm") & (slice_mlde_df["model"] == "ridge")]

Unnamed: 0,lib,encoding,model,maxes,rep
2150,TrpB3F,esm,ridge,1.0,0
2151,TrpB3F,esm,ridge,1.0,1
2152,TrpB3F,esm,ridge,1.0,2
2153,TrpB3F,esm,ridge,1.0,3
2154,TrpB3F,esm,ridge,1.0,4
2155,TrpB3F,esm,ridge,1.0,5
2156,TrpB3F,esm,ridge,1.0,6
2157,TrpB3F,esm,ridge,1.0,7
2158,TrpB3F,esm,ridge,1.0,8
2159,TrpB3F,esm,ridge,1.0,9


In [16]:
slice_mlde_df[
    (slice_mlde_df["lib"] == "TrpB3F")
    & (slice_mlde_df["encoding"] == "esm")
    & (slice_mlde_df["model"] == "ridge")
    & (slice_mlde_df["rep"] == 0)
]

Unnamed: 0,lib,encoding,model,maxes,rep
2150,TrpB3F,esm,ridge,1.0,0


In [17]:
slice_mlde_df.loc[
    
        (slice_mlde_df["lib"] == "TrpB3F")
        & (slice_mlde_df["encoding"] == "esm")
        & (slice_mlde_df["model"] == "ridge")
        & (slice_mlde_df["rep"] == 0)
    ,
    "maxes",
] = 0.999999

In [18]:
def hook(plot, element):
    plot.handles["plot"].x_range.factors = [
        (lib, encoding, model)
        for lib in LIB_NAMES
        for encoding in ["esm", "one-hot"]
        for model in ["boosting", "ridge"]
    ]


hv.Violin(
    slice_mlde_df.copy(),
    kdims=["lib", "encoding", "model"],
    vdims=["maxes"],
).opts(
    width=1200,
    height=400,
    ylim=(0, 1),
    # inner="box",
    ylabel="Max fitness",
    xlabel="Library",
    title="Max fitness distribution",
    violin_width=0.9,
    # split='model',
    violin_fill_color=hv.dim("encoding").str(),
    violin_line_color=hv.dim("model").str(),
    show_legend=True,
    # multi_level=True
    # show_legend=True,
    legend_position="top",
    legend_offset=(0, 5),
    hooks=[fixmargins, one_decimal_y, hook]
)

In [None]:
%matplotlib inline

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assume `data` is a DataFrame with all values being the same
sns.violinplot(data=slice_mlde_df, x='lib', y='maxes', hue="encoding",)
plt.show()


In [None]:
g = sns.catplot(slice_mlde_df, x='lib', y='maxes', hue="encoding", col='model', kind='violin')
g.set_titles("{col_name}")
plt.show()


In [None]:


# results/pairwise_epistasis_vis/none/scale2max.csv
# make bar plots base on that and save to the same directory
# def hook(plot, element):
#     plot.handles["plot"].x_range.factors = [
#         (lib, epistasis) for lib in LIB_NAMES for epistasis in EPISTASIS_TYPE
#     ]

# Create the Holoviews Bars element
# save_bokeh_hv(
#     hv.Bars(
#         df[df["summary_type"] == "fraction"],
#         kdims=["lib", "epistasis_type"],
#         vdims="value",
#     ).opts(
#         width=1200,
#         height=400,
#         show_legend=True,
#         legend_position="top",
#         legend_offset=(0, 5),
#         ylabel="Fraction",
#         multi_level=False,
#         title="Fraction of pairwise epistasis types",
#         xlabel="Library",
#         hooks=[fixmargins, one_decimal_y, hook],
#     ),
#     plot_name="scale2max",
#     plot_path=os.path.join("results/pairwise_epistasis_vis", "none"),
# )


In [None]:
mlde_df = pd.read_csv("results/mlde/vis/all_df.csv")
mlde_df

In [7]:
mlde_df = pd.read_csv("results/mlde/vis_2/all_df.csv")
mlde_df

Unnamed: 0,encoding,model,n_sample,ft_lib,rep,all_maxes,all_means,top_maxes,top_means,ndcgs,rhos,if_truemaxs,truemax_inds,n_mut_cutoff,lib,zs,n_top
0,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,0.840951,0.156936,1.000000,0.499483,0.930975,0.498705,1.0,267.0,all,DHFR,Triad_score,384
1,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,1,0.837710,0.159552,0.996537,0.500002,0.932816,0.480707,0.0,686.0,all,DHFR,Triad_score,384
2,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,2,0.841871,0.158632,0.996537,0.464790,0.901309,0.464723,0.0,618.0,all,DHFR,Triad_score,384
3,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,3,0.812338,0.180189,1.000000,0.474807,0.926957,0.420841,1.0,2.0,all,DHFR,Triad_score,384
4,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,4,0.841787,0.162283,1.000000,0.485674,0.923643,0.430931,1.0,274.0,all,DHFR,Triad_score,384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689995,one-hot,ridge,576,77,45,0.744631,0.004855,0.739434,0.287961,0.978963,0.189986,0.0,13237.0,single,TrpB4,none,96
689996,one-hot,ridge,576,77,46,0.771967,0.016475,0.749107,0.300604,0.979479,0.188705,0.0,10338.0,single,TrpB4,none,96
689997,one-hot,ridge,576,77,47,0.697771,0.006323,0.749107,0.333802,0.980334,0.194473,0.0,13049.0,single,TrpB4,none,96
689998,one-hot,ridge,576,77,48,0.674717,0.013695,0.749107,0.343322,0.980304,0.190111,0.0,8280.0,single,TrpB4,none,96


In [10]:
mlde_df[mlde_df.isna().any(axis=1)]

Unnamed: 0,encoding,model,n_sample,ft_lib,rep,all_maxes,all_means,top_maxes,top_means,ndcgs,rhos,if_truemaxs,truemax_inds,n_mut_cutoff,lib,zs,n_top


In [9]:
mlde_df["n_sample"].unique()

array([384,  96, 192, 288, 480, 576])

In [None]:
len(mlde_df[(mlde_df["encoding"] == "one-hot")])

In [None]:
mlde_df[(mlde_df["encoding"] == "one-hot") & (mlde_df["lib"] == "GB1") & (mlde_df["model"] == "boosting") & (mlde_df["rep"] == 0)]

In [None]:
len(mlde_df[(mlde_df["encoding"] == "one-hot") & (mlde_df["lib"] == "GB1") & (mlde_df["model"] == "boosting") & (mlde_df["rep"] == 0)])

In [None]:
mlde_df.truemax_inds.unique()

In [5]:
# summary for local opts

from glob import glob
import pandas as pd 

In [9]:
loc_opt_list = glob("results/local_optima/scale2max/*.csv")
loc_opt_list

['results/local_optima/scale2max/TrpB3F_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3E_loc_opt_escape.csv',
 'results/local_optima/scale2max/DHFR_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3B_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3H_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3G_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB4_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3A_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3I_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3D_loc_opt_escape.csv',
 'results/local_optima/scale2max/TrpB3C_loc_opt_escape.csv']

In [8]:
pd.read_csv("results/local_optima/scale2max/DHFR_loc_opt_escape.csv")

Unnamed: 0,AAs,AA1,AA2,AA3,fitness,active,muts,n_escape,"(0, 1)","(0, 2)","(1, 2)",frac pairs no escape,frac pairs that escape
0,FDV,F,D,V,0.846091,True,A26F:L28V,5,0,0,0,1.0,0.0
1,KEM,K,E,M,1.0,True,A26K:D27E:L28M,0,0,0,0,1.0,0.0
2,TDW,T,D,W,0.847249,True,A26T:L28W,4,0,0,0,1.0,0.0
