In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext autoreload
%autoreload 2
%load_ext blackcellmagic

In [3]:
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
import pickle


font = {'family' : 'arial',
        'size'   : 16}
mpl.rc('font', **font)
mpl.rc('lines', linewidth=2)
mpl.rcParams['axes.linewidth'] = 2
mpl.rcParams['xtick.major.width'] = 2
mpl.rcParams['ytick.major.width'] = 2

# General imports
import glob
import os
import re
import pickle
import datetime

# Data manipulation
#import growth_analysis as ga
import pandas as pd
#from multiprocesspandas import applyparallel
import numpy as np
from sklearn.impute import KNNImputer

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter

from bokeh.themes.theme import Theme

import panel as pn
pn.config.comms = "vscode"

# Making graphs
import matplotlib.pyplot as plt
import itertools
import tqdm
from multiprocessing import Pool
from operator import itemgetter

hv.extension('bokeh')

In [89]:
"""A function for parsing the mlde results"""

from __future__ import annotations

import os
import re
import itertools
from glob import glob
from tqdm import tqdm
from copy import deepcopy

import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
import pickle

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter


import panel as pn

pn.config.comms = "vscode"

# Making graphs
import matplotlib.pyplot as plt

hv.extension("bokeh")


from SSMuLA.landscape_global import n_mut_cutoff_dict
from SSMuLA.util import checkNgen_folder, get_file_name


default_metrics = ["maxes", "means", "ndcgs", "rhos"]


class MLDEParser:
    """A class for parsing EACH mlde result npy file"""

    def __init__(
        self, 
        mlde_npy_path: str, 
        mlde_results_dir: str = "results/mlde/saved"
    ):

        """
        Args:
        - mlde_npy: str, the path to the mlde npy file
            ie. 'results/mlde/saved/none/none-double/scale2max/GB1/one-hot_boosting|ridge_sample384_top96.npy'
        - mlde_results_dir: str, the directory where the mlde results are saved

        Note:
        
        {'data_config': {'input_csv': 'results/zs_comb/none/scale2max/GB1.csv',
            'zs_predictor': 'none',
            'encoding': ['one-hot'],
            'ft_libs': [149361],
            'scale_fit': 'max',
            'filter_min_by': 'none',
            'n_mut_cutoff': 2},
            'model_config': {'model_classes': ['boosting', 'ridge']},
            'train_config': {'n_sample': [384],
            'n_splits': 5,
            'n_replicate': 100,
            'n_worker': 1,
            'global_seed': 42,
            'verbose': False,
            'save_model': False},
            'eval_config': {'n_top': 96}}

        """

        self._mlde_npy_path = mlde_npy_path
        self._mlde_results_dir = mlde_results_dir

        # get all npy keys as properties
        # should be ['config', 'top_seqs', 'maxes', 'means', 'ndcgs', 'rhos', 'unique', 'labelled', 'y_preds']
        for attr, val in self.npy_item.items():
            setattr(self, attr, val)
        
        if not hasattr(self, 'config'):
            print(f"no config found for {self._mlde_npy_path}")
            pass

        # set all config_dict keys as properties
        # should be ['data_config', 'model_config', 'train_config', 'eval_config']
        for attr, val in self.config.items():
            setattr(self, attr, val)
            for k, v in val.items():
                setattr(self, k, v)
                if isinstance(v, list):
                    setattr(self, f"{k}_len", len(v))

        self._metric_df = self._get_metric_df()    
            
    def _get_metric_df(self) -> pd.DataFrame:
        """Return the metric df"""

        # set up df for all metrics
        metric_df = pd.DataFrame(
                    {
                        "encoding": self.encoding_index,
                        "models": self.models_index,
                        "n_sample": self.n_sample_index,
                        "ft_lib": self.lib_index,
                        "repeats": self.repeats_index,
                    }
                )
        
        metric_df['encoding'] = metric_df['encoding'].map({i: v for i, v in enumerate(self.encoding)})
        metric_df['models'] = metric_df['models'].map({i: v for i, v in enumerate(self.model_classes)})
        metric_df['n_sample'] = metric_df['n_sample'].map({i: v for i, v in enumerate(self.n_sample)})
        metric_df['ft_lib'] = metric_df['ft_lib'].map({i: v for i, v in enumerate(self.ft_libs)})
        metric_df["n_mut_cutoff"] = n_mut_cutoff_dict[self.n_mut_cutoff]
        metric_df["lib"] = get_file_name(self.input_csv)
        metric_df["zs"] = self.zs_predictor
        metric_df["n_top"] = self.n_top
        
        # get all metrics as properties
        for m in default_metrics:
            m_array = getattr(self, m)
            # get rid of nan col
            try:
                metric_df[m] = m_array[:, ~np.isnan(m_array).any(axis=0)].flatten()
            except:
                print(self._mlde_npy_path, m_array.shape, m_array[:, ~np.isnan(m_array).any(axis=0)].shape)

        return metric_df

    @property
    def npy_item(self) -> dict:
        """Return the npy item"""
        return np.load(self._mlde_npy_path, allow_pickle=True).item()

    @property
    def npy_item_keys(self) -> list[str]:
        """Return the keys of the npy item"""
        return deepcopy(list(self.npy_item.keys()))

    @property
    def output_shape(self) -> tuple:

        """
        Return the shape of the output for maxes, means, ndcgs, rhos, unique, and labelled

            len(encodings),
            len(model_classes),
            len(n_samples),
            len(ft_libs),
            n_replicate,
        """

        return (
            self.encoding_len,
            self.model_classes_len,
            self.n_sample_len,
            self.ft_libs_len,
            self.n_replicate,
        )

    @property
    def top_seq_output_shape(self) -> tuple:

        """
        Return the shape of the output for top_seqs

            len(encodings),
            len(model_classes),
            len(n_samples),
            len(ft_libs),
            n_replicate,
            n_top,
        """

        return (
            self.encoding_len,
            self.model_classes_len,
            self.n_sample_len,
            self.ft_libs_len,
            self.n_replicate,
            self.n_top,
        )

    @property
    def encoding_index(self) -> np.ndarray:
        """Return the encoding index"""
        return np.array(
            [
                [i]
                * self.output_shape[1]
                * self.output_shape[2]
                * self.output_shape[3]
                * self.output_shape[4]
                for i in range(self.output_shape[0])
            ]
        ).flatten()

    @property
    def models_index(self) -> np.ndarray:
        """Return the models index"""
        return np.array(
            [
                [i] * self.output_shape[2] * self.output_shape[3] * self.output_shape[4]
                for i in range(self.output_shape[1])
                for _ in range(self.output_shape[0])
            ]
        ).flatten()

    @property
    def n_sample_index(self) -> np.ndarray:
        """Return the n_sample index"""
        return np.array(
            [
                [i] * self.output_shape[3] * self.output_shape[4]
                for i in range(self.output_shape[2])
                for _ in range(self.output_shape[1])
                for _ in range(self.output_shape[0])
            ]
        ).flatten()

    @property
    def lib_index(self) -> np.ndarray:
        """Return the lib index"""
        return np.array(
            [
                [i] * self.output_shape[4]
                for i in range(self.output_shape[3])
                for _ in range(self.output_shape[2])
                for _ in range(self.output_shape[1])
                for _ in range(self.output_shape[0])
            ]
        ).flatten()

    @property
    def repeats_index(self) -> np.ndarray:
        """Return the repeats index"""
        return np.array(
            [
                i
                for i in range(self.output_shape[4])
                for _ in range(self.output_shape[3])
                for _ in range(self.output_shape[2])
                for _ in range(self.output_shape[1])
                for _ in range(self.output_shape[0])
            ]
        )

    @property
    def metric_df(self) -> pd.DataFrame:
        """Return the metric df"""
        return self._metric_df

In [83]:
gb1_d_parser = MLDEParser(
        mlde_npy_path="results/mlde/saved/none/none-double/scale2max/GB1/one-hot_boosting|ridge_sample384_top96.npy", 
        # mlde_results_dir: str = "results/mlde/saved"
    )

maxes (1, 2, 1, 1, 100) (1, 200)
means (1, 2, 1, 1, 100) (1, 200)
ndcgs (1, 2, 1, 1, 100) (1, 200)
rhos (1, 2, 1, 1, 100) (1, 200)


In [84]:
gb1_d_parser.metric_df

Unnamed: 0,encoding,models,n_sample,ft_lib,repeats,n_mut_cutoff,lib,zs,n_top,maxes,means,ndcgs,rhos
0,one-hot,boosting,384,149361,0,double,GB1,none,96,0.614051,0.297161,0.797368,0.324864
1,one-hot,boosting,384,149361,0,double,GB1,none,96,0.658760,0.314371,0.772456,0.262550
2,one-hot,boosting,384,149361,1,double,GB1,none,96,0.723981,0.195821,0.756840,0.242435
3,one-hot,boosting,384,149361,1,double,GB1,none,96,0.598350,0.297064,0.772549,0.267467
4,one-hot,boosting,384,149361,2,double,GB1,none,96,0.741846,0.279394,0.790558,0.298536
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,one-hot,ridge,384,149361,97,double,GB1,none,96,0.918190,0.203111,0.774212,0.260462
196,one-hot,ridge,384,149361,98,double,GB1,none,96,0.658760,0.340319,0.785797,0.255417
197,one-hot,ridge,384,149361,98,double,GB1,none,96,0.825004,0.253331,0.793878,0.290284
198,one-hot,ridge,384,149361,99,double,GB1,none,96,0.918190,0.259793,0.781041,0.264912


In [91]:
def get_all_metric_df(mlde_results_dir: str = "results/mlde/saved") -> pd.DataFrame:
    """Return the metric df for all mlde results"""
    mlde_npy_paths = sorted(glob(f"{mlde_results_dir}/**/*.npy", recursive=True))
    # one-hot needs redo
    mlde_parsers = [MLDEParser(mlde_npy_path) for mlde_npy_path in tqdm(mlde_npy_paths) if "save_old_all" not in mlde_npy_path]
    return pd.concat([mlde_parser.metric_df for mlde_parser in mlde_parsers])

In [86]:
MLDEParser("results/mlde/saved/Triad_score/none-double/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy")

maxes (3, 2, 1, 3, 50) (3, 200)
means (3, 2, 1, 3, 50) (3, 200)
ndcgs (3, 2, 1, 3, 50) (3, 200)
rhos (3, 2, 1, 3, 50) (3, 200)


<__main__.MLDEParser at 0x7f6c6fe2d0c0>

In [152]:
dhfr_oh_s = MLDEParser("results/mlde_test/saved/none/none-single/scale2max/DHFR/one-hot_boosting|ridge_sample384_top96.npy")

In [153]:
dhfr_oh_s.config

{'data_config': {'input_csv': 'results/zs_comb/none/scale2max/DHFR.csv',
  'zs_predictor': 'none',
  'encoding': ['one-hot'],
  'ft_libs': [58],
  'scale_fit': 'max',
  'filter_min_by': 'none',
  'n_mut_cutoff': 1},
 'model_config': {'model_classes': ['boosting', 'ridge']},
 'train_config': {'n_sample': [384],
  'n_splits': 5,
  'n_replicate': 6,
  'boosting_n_worker': 1,
  'global_seed': 42,
  'verbose': False,
  'save_model': False},
 'eval_config': {'n_top': 96}}

In [155]:
dhfr_oh_s.rhos

array([[[[[0.60320719, 0.67373085, 0.67928259, 0.65834313, 0.67825187,
           0.68106933]]],


        [[[0.44764637, 0.54637353, 0.54318   , 0.480978  , 0.54897341,
           0.53554981]]]]])

In [117]:
dhfr_oh_s.metric_df

Unnamed: 0,encoding,models,n_sample,ft_lib,repeats,n_mut_cutoff,lib,zs,n_top,maxes,means,ndcgs,rhos
0,one-hot,boosting,384,8000,0,single,DHFR,none,96,1.0,0.477717,0.891943,0.660806
1,one-hot,boosting,384,8000,0,single,DHFR,none,96,1.0,0.477717,0.891943,0.660806
2,one-hot,boosting,384,8000,1,single,DHFR,none,96,1.0,0.477717,0.891943,0.660806
3,one-hot,boosting,384,8000,1,single,DHFR,none,96,1.0,0.477717,0.891943,0.660806
4,one-hot,boosting,384,8000,2,single,DHFR,none,96,1.0,0.477717,0.891943,0.660806
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,one-hot,ridge,384,8000,97,single,DHFR,none,96,1.0,0.501183,0.885254,0.526966
196,one-hot,ridge,384,8000,98,single,DHFR,none,96,1.0,0.501183,0.885254,0.526966
197,one-hot,ridge,384,8000,98,single,DHFR,none,96,1.0,0.501183,0.885254,0.526966
198,one-hot,ridge,384,8000,99,single,DHFR,none,96,1.0,0.501183,0.885254,0.526966


In [157]:
dhfr_s = np.load("results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy", allow_pickle=True).item()

In [159]:
dhfr_s["config"]

{'data_config': {'input_csv': 'results/zs_comb/none/scale2max/DHFR.csv',
  'zs_predictor': 'Triad_score',
  'encoding': ['esm2_t33_650M_UR50D-flatten_site',
   'esm2_t33_650M_UR50D-mean_all',
   'esm2_t33_650M_UR50D-mean_site'],
  'ft_libs': [58],
  'scale_fit': 'max',
  'filter_min_by': 'none',
  'n_mut_cutoff': 1},
 'model_config': {'model_classes': ['boosting', 'ridge']},
 'train_config': {'n_sample': [384],
  'n_splits': 5,
  'n_replicate': 50,
  'boosting_n_worker': 1,
  'global_seed': 42,
  'verbose': False,
  'save_model': False},
 'eval_config': {'n_top': 384}}

In [170]:
dhfr_s["ndcgs"][0][0][0][0]

array([0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108,
       0.91038108, 0.91038108, 0.91038108, 0.91038108, 0.91038108])

In [156]:
all_df = get_all_metric_df()
all_df

  0%|          | 0/623 [00:00<?, ?it/s]

 13%|█▎        | 80/623 [00:06<01:02,  8.68it/s]

results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_bo

 13%|█▎        | 84/623 [00:07<01:01,  8.72it/s]

results/mlde/saved/Triad_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boostin

 15%|█▍        | 93/623 [00:08<00:51, 10.32it/s]

results/mlde/saved/Triad_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-me

 18%|█▊        | 110/623 [00:08<00:19, 25.69it/s]

results/mlde/saved/Triad_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-me

 19%|█▉        | 120/623 [00:08<00:15, 32.94it/s]

results/mlde/saved/Triad_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-me

 20%|██        | 125/623 [00:09<00:38, 13.07it/s]

results/mlde/saved/Triad_score/none-single/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/Triad_score/none-single/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)


 33%|███▎      | 205/623 [00:16<00:27, 15.06it/s]

results/mlde/saved/esm_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/esm_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/esm_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/esm_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 5) (3, 30)


 33%|███▎      | 208/623 [00:16<00:39, 10.40it/s]

results/mlde/saved/esm_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|rid

 34%|███▍      | 213/623 [00:17<00:40, 10.11it/s]

results/mlde/saved/esm_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sa

 36%|███▌      | 222/623 [00:18<00:35, 11.37it/s]

results/mlde/saved/esm_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_bo

 38%|███▊      | 237/623 [00:18<00:15, 24.55it/s]

results/mlde/saved/esm_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_bo

 40%|███▉      | 247/623 [00:18<00:11, 32.58it/s]

results/mlde/saved/esm_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3G/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosti

 40%|████      | 252/623 [00:18<00:10, 35.69it/s]

results/mlde/saved/esm_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/esm_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_bo

 53%|█████▎    | 331/623 [00:25<00:14, 20.69it/s]

results/mlde/saved/ev_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/ev_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/ev_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/ev_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 5) (3, 30)
results/mlde/saved/ev_score/none-double/scale2max/TrpB4/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sampl

 54%|█████▍    | 336/623 [00:25<00:19, 14.43it/s]

results/mlde/saved/ev_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/DHFR/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sa

 55%|█████▍    | 340/623 [00:26<00:22, 12.52it/s]

results/mlde/saved/ev_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/GB1/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample3

 56%|█████▌    | 348/623 [00:27<00:24, 11.20it/s]

results/mlde/saved/ev_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3A/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boostin

 57%|█████▋    | 358/623 [00:27<00:14, 18.71it/s]

results/mlde/saved/ev_score/none-single/scale2max/TrpB3C/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3C/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3C/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3C/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top96.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3D/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ri

 59%|█████▉    | 369/623 [00:27<00:09, 27.84it/s]

results/mlde/saved/ev_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3F/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boostin

 61%|██████    | 379/623 [00:28<00:06, 35.53it/s]

results/mlde/saved/ev_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boosting|ridge_sample384_top384.npy (3, 2, 1, 3, 50) (3, 300)
results/mlde/saved/ev_score/none-single/scale2max/TrpB3I/esm2_t33_650M_UR50D-flatten_site|esm2_t33_650M_UR50D-mean_all|esm2_t33_650M_UR50D-mean_site_boostin

100%|██████████| 623/623 [00:33<00:00, 18.74it/s]


Unnamed: 0,encoding,models,n_sample,ft_lib,repeats,n_mut_cutoff,lib,zs,n_top,maxes,means,ndcgs,rhos
0,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,all,DHFR,Triad_score,384,1.000000,0.502199,0.921660,0.792539
1,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,all,DHFR,Triad_score,384,1.000000,0.468277,0.918007,0.737994
2,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,all,DHFR,Triad_score,384,0.996537,0.486061,0.915259,0.801123
3,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,all,DHFR,Triad_score,384,1.000000,0.449828,0.912758,0.655603
4,esm2_t33_650M_UR50D-flatten_site,boosting,384,4000,0,all,DHFR,Triad_score,384,1.000000,0.480492,0.928671,0.781353
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,one-hot,ridge,384,159129,97,single,TrpB4,none,96,0.749107,0.376384,0.980014,0.321193
196,one-hot,ridge,384,159129,98,single,TrpB4,none,96,0.749107,0.376384,0.980014,0.321193
197,one-hot,ridge,384,159129,98,single,TrpB4,none,96,0.749107,0.376384,0.980014,0.321193
198,one-hot,ridge,384,159129,99,single,TrpB4,none,96,0.749107,0.376384,0.980014,0.321193


In [135]:
all_df[
        (all_df["zs"] == "none")
        & (all_df["encoding"] != "one-hot")
        & (all_df["models"] == "boosting")
        & (all_df["n_top"] == 96)
    ].drop_duplicates()

Unnamed: 0,encoding,models,n_sample,ft_lib,repeats,n_mut_cutoff,lib,zs,n_top,maxes,means,ndcgs,rhos
0,esm2_t33_650M_UR50D-flatten_site,boosting,384,8000,0,all,DHFR,none,96,1.000000,0.600809,0.938639,0.806137
1,esm2_t33_650M_UR50D-flatten_site,boosting,384,8000,0,all,DHFR,none,96,1.000000,0.625357,0.939710,0.827160
2,esm2_t33_650M_UR50D-flatten_site,boosting,384,8000,0,all,DHFR,none,96,1.000000,0.621703,0.944101,0.847372
3,esm2_t33_650M_UR50D-flatten_site,boosting,384,8000,0,all,DHFR,none,96,1.000000,0.655619,0.955815,0.861880
4,esm2_t33_650M_UR50D-flatten_site,boosting,384,8000,0,all,DHFR,none,96,1.000000,0.608112,0.946087,0.833112
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,esm2_t33_650M_UR50D-mean_all,boosting,384,159129,20,single,TrpB4,none,96,0.819071,0.258287,0.977343,0.296025
126,esm2_t33_650M_UR50D-mean_all,boosting,384,159129,21,single,TrpB4,none,96,0.819071,0.258287,0.977343,0.296025
132,esm2_t33_650M_UR50D-mean_all,boosting,384,159129,22,single,TrpB4,none,96,0.819071,0.258287,0.977343,0.296025
138,esm2_t33_650M_UR50D-mean_all,boosting,384,159129,23,single,TrpB4,none,96,0.819071,0.258287,0.977343,0.296025


In [136]:
all_df[
        (all_df["zs"] == "Triad_score")
        & (all_df["encoding"] == "one-hot")
        & (all_df["models"] == "boosting")
        & (all_df["n_top"] == 96)
    ]

Unnamed: 0,encoding,models,n_sample,ft_lib,repeats,n_mut_cutoff,lib,zs,n_top,maxes,means,ndcgs,rhos
0,one-hot,boosting,384,4000,0,all,DHFR,Triad_score,96,0.982530,0.573868,0.934276,0.821314
1,one-hot,boosting,384,4000,0,all,DHFR,Triad_score,96,1.000000,0.514007,0.914121,0.790259
2,one-hot,boosting,384,4000,0,all,DHFR,Triad_score,96,0.996537,0.445766,0.899621,0.783071
3,one-hot,boosting,384,4000,0,all,DHFR,Triad_score,96,0.865538,0.567197,0.919689,0.805907
4,one-hot,boosting,384,4000,0,all,DHFR,Triad_score,96,1.000000,0.565103,0.915818,0.786375
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,one-hot,boosting,384,77,49,single,TrpB4,Triad_score,96,0.752895,0.501260,0.982069,0.342211
296,one-hot,boosting,384,77,49,single,TrpB4,Triad_score,96,0.752895,0.501260,0.982069,0.342211
297,one-hot,boosting,384,77,49,single,TrpB4,Triad_score,96,0.752895,0.501260,0.982069,0.342211
298,one-hot,boosting,384,77,49,single,TrpB4,Triad_score,96,0.752895,0.501260,0.982069,0.342211


In [134]:
hv.Violin(
    all_df[
        (all_df["zs"] == "none")
        & (all_df["encoding"] == "one-hot")
        & (all_df["models"] == "boosting")
        & (all_df["n_top"] == 96)
    ].copy(),
    kdims=["lib", "n_mut_cutoff"],
    vdims=["rhos"],
).opts(
    width=1200,
    height=400,
    # violin_color=hv.dim("n_mut_cuttoff").str()
    # hooks=[one_decimal_x, one_decimal_y, fixmargins]
)

### Let's do just onehot s2m and d2m no zs

In [4]:
from glob import glob

In [11]:
mlde_d_onehot_96 = sorted(glob("results/mlde/saved/none/none-double/scale2max/*/one-hot*96.npy"))
mlde_d_onehot_96

['results/mlde/saved/none/none-double/scale2max/DHFR/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/GB1/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3A/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3B/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3C/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3D/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3E/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3F/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3G/one-hot_boosting|ridge_sample384_top96.npy',
 'results/mlde/saved/none/none-double/scale2max/TrpB3H/one-hot_boosting|ridge_sample384_top96.npy',
 'res

In [32]:
lib_dets = "double"

In [13]:
gb1_d_onehot_96 = np.load("results/mlde/saved/none/none-double/scale2max/GB1/one-hot_boosting|ridge_sample384_top96.npy", allow_pickle=True)
gb1_d_onehot_96

array({'config': {'data_config': {'input_csv': 'results/zs_comb/none/scale2max/GB1.csv', 'zs_predictor': 'none', 'encoding': ['one-hot'], 'ft_libs': [149361], 'scale_fit': 'max', 'filter_min_by': 'none', 'n_mut_cutoff': 2}, 'model_config': {'model_classes': ['boosting', 'ridge']}, 'train_config': {'n_sample': [384], 'n_splits': 5, 'n_replicate': 100, 'n_worker': 1, 'global_seed': 42, 'verbose': False, 'save_model': False}, 'eval_config': {'n_top': 96}}, 'top_seqs': array([[[[[['FFGM', 'IFGM', 'TFGM', ..., 'NYGM', 'TWGL', 'TWGV'],
           ['LWGF', 'FFGM', 'MFGM', ..., 'FFGW', 'IYGI', 'FYGQ'],
           ['LWGF', 'VWGF', 'TWGF', ..., 'WWFH', 'LWSF', 'LWRF'],
           ...,
           ['LYGF', 'LYGM', 'LYGA', ..., 'TYFG', 'TYGM', 'TYMG'],
           ['CYGL', 'IYGL', 'CYGC', ..., 'CHGV', 'CWAG', 'LYAG'],
           ['TWCA', 'LHCA', 'VYGC', ..., 'LPCA', 'PDCA', 'TRCA']]]],



        [[[['LWGC', 'LWGM', 'LFGC', ..., 'LHAC', 'RWGM', 'LHAM'],
           ['LFGC', 'LFGF', 'LYGC', ..., 'CYGC

In [14]:
gb1_d_onehot_96.item().keys()

dict_keys(['config', 'top_seqs', 'maxes', 'means', 'ndcgs', 'rhos', 'unique', 'labelled', 'y_preds'])

In [47]:
gb1_d_onehot_96.item()["unique"]

array([[[[[384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.]]],


        [[[384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
           384., 384., 384., 384., 384., 384., 384., 384., 384., 384.,
 

In [19]:
gb1_d_onehot_96.item()["config"]

{'data_config': {'input_csv': 'results/zs_comb/none/scale2max/GB1.csv',
  'zs_predictor': 'none',
  'encoding': ['one-hot'],
  'ft_libs': [149361],
  'scale_fit': 'max',
  'filter_min_by': 'none',
  'n_mut_cutoff': 2},
 'model_config': {'model_classes': ['boosting', 'ridge']},
 'train_config': {'n_sample': [384],
  'n_splits': 5,
  'n_replicate': 100,
  'n_worker': 1,
  'global_seed': 42,
  'verbose': False,
  'save_model': False},
 'eval_config': {'n_top': 96}}

In [15]:
gb1_d_onehot_96.item()["maxes"].shape, gb1_d_onehot_96.item()["means"].shape, gb1_d_onehot_96.item()["top_seqs"].shape, gb1_d_onehot_96.item()["y_preds"].shape

((1, 2, 1, 1, 100),
 (1, 2, 1, 1, 100),
 (1, 2, 1, 1, 100, 96),
 (1, 2, 1, 1, 100, 149361))

In [49]:
gb1_d_onehot_96.item().keys()

dict_keys(['config', 'top_seqs', 'maxes', 'means', 'ndcgs', 'rhos', 'unique', 'labelled', 'y_preds'])

In [50]:
gb1_d_onehot_96.item()["unique"].shape

(1, 2, 1, 1, 100)

In [51]:
gb1_d_onehot_96.item()["labelled"].shape

(1, 2, 1, 1, 100)

In [17]:
gb1_d_onehot_96.item()["maxes"][0][0].flatten().shape

(100,)

In [16]:
gb1_d_onehot_96.item()["maxes"][0][1]

array([[[0.68791511, 0.65875997, 0.65875997, 0.62093539, 0.59681201,
         0.68958016, 1.        , 0.68958016, 0.64126117, 0.91819031,
         0.65875997, 0.64660666, 0.51195701, 0.69890765, 0.91819031,
         0.65875997, 0.62093539, 0.91819031, 0.79071951, 0.82500438,
         0.91819031, 0.64126117, 1.        , 0.62093539, 0.65875997,
         0.61524237, 0.59835001, 0.82500438, 0.65875997, 0.91819031,
         0.62093539, 0.82500438, 0.68791511, 0.62093539, 0.82500438,
         0.59835001, 0.82500438, 0.62712818, 0.62712818, 1.        ,
         0.91819031, 0.91819031, 0.62093539, 0.82500438, 0.59053442,
         0.59681201, 0.82500438, 0.59835001, 0.61524237, 0.62093539,
         0.65875997, 0.69890765, 0.69890765, 0.65875997, 0.59681201,
         1.        , 0.69890765, 0.62093539, 0.65489649, 0.91819031,
         0.59835001, 0.62712818, 0.862211  , 0.51195701, 0.91819031,
         0.65875997, 0.64126117, 0.62712818, 0.79071951, 0.65875997,
         0.62093539, 0.61524237, 0

In [17]:
len(gb1_d_onehot_96.item()["top_seqs"][0][0][0][0][0])

96

In [18]:
gb1_d_onehot_96.item()["top_seqs"][0][0][0][0][0]

array(['FFGM', 'IFGM', 'TFGM', 'MFGM', 'LFGM', 'FYGC', 'LYGC', 'CFGM',
       'FYGM', 'MYGC', 'RFGM', 'YFGM', 'EFGM', 'KFGM', 'TYGC', 'QFGM',
       'VFGM', 'LWGF', 'IYGC', 'HFGM', 'WFGM', 'VYGC', 'LWGA', 'LYGM',
       'SFGM', 'AFGM', 'MYGM', 'NFGM', 'TWGA', 'CYGC', 'LWGM', 'TWGF',
       'FWGM', 'RYGC', 'YYGC', 'EYGC', 'GFGM', 'TWGM', 'KYGC', 'QYGC',
       'WYGC', 'LWGN', 'WFFM', 'DFGM', 'LWGS', 'TYGM', 'LWGI', 'LWGC',
       'TWGC', 'IFGC', 'AYGC', 'SYGC', 'IFGA', 'LWGL', 'LWGT', 'FWGF',
       'NYGC', 'VYGM', 'IFGF', 'CYGM', 'LWGV', 'LWGG', 'LWGW', 'FWGA',
       'IYGM', 'RYGM', 'YYGM', 'EYGM', 'KYGM', 'PYGC', 'LWGD', 'QYGM',
       'LWGY', 'HYGC', 'DYGC', 'WYGM', 'HYGM', 'FWGC', 'GYGC', 'IFGI',
       'TWGS', 'TFGC', 'TFGA', 'MHGA', 'LWGH', 'AYGM', 'SYGM', 'TWGI',
       'LWGQ', 'TFGF', 'FWGS', 'WYFM', 'MHGF', 'NYGM', 'TWGL', 'TWGV'],
      dtype='<U4')

In [48]:
arr.shape

(1, 2, 1, 1, 100)

In [36]:
# get 'maxes', 'means', 'ndcgs', 'rhos' to df

metrics = ["maxes", "means", "ndcgs", "rhos"]

arr = gb1_d_onehot_96.item()["maxes"].copy()
# Flatten the array
flattened_arr = arr.flatten()

# Create index arrays for each dimension
encoding_index = np.array([[i] * arr.shape[1] * arr.shape[2] * arr.shape[3] * arr.shape[4] for i in range(arr.shape[0])]).flatten()
models_index = np.array([[i] * arr.shape[2] * arr.shape[3] * arr.shape[4] for i in range(arr.shape[1]) for _ in range(arr.shape[0])]).flatten()
n_sample_index = np.array([[i] * arr.shape[3] * arr.shape[4] for i in range(arr.shape[2]) for _ in range(arr.shape[1]) for _ in range(arr.shape[0])]).flatten()
lib_index = np.array([[i] * arr.shape[4] for i in range(arr.shape[3]) for _ in range(arr.shape[2]) for _ in range(arr.shape[1]) for _ in range(arr.shape[0])]).flatten()
repeats_index = np.array([i for i in range(arr.shape[4]) for _ in range(arr.shape[3]) for _ in range(arr.shape[2]) for _ in range(arr.shape[1]) for _ in range(arr.shape[0])])

# Create DataFrame
df = pd.DataFrame({
    'Encoding': encoding_index,
    'Models': models_index,
    'N_Sample': n_sample_index,
    'Lib': lib_index,
    'Repeats': repeats_index,
    'maxes': flattened_arr
})

for m in metrics[1:]:
    arr = gb1_d_onehot_96.item()[m].copy()
    # Flatten the array
    flattened_arr = arr.flatten()

    # Create DataFrame
    df[m] = flattened_arr

In [37]:
df

Unnamed: 0,Encoding,Models,N_Sample,Lib,Repeats,maxes,means,ndcgs,rhos
0,0,0,0,0,0,0.614051,0.297161,0.797368,0.324864
1,0,0,0,0,0,0.658760,0.314371,0.772456,0.262550
2,0,0,0,0,1,0.723981,0.195821,0.756840,0.242435
3,0,0,0,0,1,0.598350,0.297064,0.772549,0.267467
4,0,0,0,0,2,0.741846,0.279394,0.790558,0.298536
...,...,...,...,...,...,...,...,...,...
195,0,1,0,0,97,0.918190,0.203111,0.774212,0.260462
196,0,1,0,0,98,0.658760,0.340319,0.785797,0.255417
197,0,1,0,0,98,0.825004,0.253331,0.793878,0.290284
198,0,1,0,0,99,0.918190,0.259793,0.781041,0.264912


In [38]:
gb1_d_onehot_96_config = gb1_d_onehot_96.item()["config"]
gb1_d_onehot_96_config

{'data_config': {'input_csv': 'results/zs_comb/none/scale2max/GB1.csv',
  'zs_predictor': 'none',
  'encoding': ['one-hot'],
  'ft_libs': [149361],
  'scale_fit': 'max',
  'filter_min_by': 'none',
  'n_mut_cutoff': 2},
 'model_config': {'model_classes': ['boosting', 'ridge']},
 'train_config': {'n_sample': [384],
  'n_splits': 5,
  'n_replicate': 100,
  'n_worker': 1,
  'global_seed': 42,
  'verbose': False,
  'save_model': False},
 'eval_config': {'n_top': 96}}

In [39]:
# Map integer indices to labels

df['Encoding'] = df['Encoding'].map({i: v for i, v in enumerate(gb1_d_onehot_96_config["data_config"]["encoding"])})
df['Models'] = df['Models'].map({i: v for i, v in enumerate(gb1_d_onehot_96_config["model_config"]["model_classes"])})
df['N_Sample'] = df['N_Sample'].map({i: v for i, v in enumerate(gb1_d_onehot_96_config["train_config"]["n_sample"])})
# df['Lib'] = df['Lib'].map({i: v for i, v in enumerate(gb1_d_onehot_96_config["data_config"]["ft_libs"])})
df["Libs"] = lib_dets
df


Unnamed: 0,Encoding,Models,N_Sample,Lib,Repeats,maxes,means,ndcgs,rhos
0,one-hot,boosting,384,double,0,0.614051,0.297161,0.797368,0.324864
1,one-hot,boosting,384,double,0,0.658760,0.314371,0.772456,0.262550
2,one-hot,boosting,384,double,1,0.723981,0.195821,0.756840,0.242435
3,one-hot,boosting,384,double,1,0.598350,0.297064,0.772549,0.267467
4,one-hot,boosting,384,double,2,0.741846,0.279394,0.790558,0.298536
...,...,...,...,...,...,...,...,...,...
195,one-hot,ridge,384,double,97,0.918190,0.203111,0.774212,0.260462
196,one-hot,ridge,384,double,98,0.658760,0.340319,0.785797,0.255417
197,one-hot,ridge,384,double,98,0.825004,0.253331,0.793878,0.290284
198,one-hot,ridge,384,double,99,0.918190,0.259793,0.781041,0.264912


In [None]:
df

In [40]:
#MLDE violin plots
# Plot Hooks
def one_decimal_x(plot,element):
    plot.handles['plot'].xaxis[0].formatter = NumeralTickFormatter(format="0.0")

def one_decimal_y(plot,element):
    plot.handles['plot'].yaxis[0].formatter = NumeralTickFormatter(format="0.0")

def fixmargins(plot,element):
    plot.handles['plot'].min_border_right=30
    plot.handles['plot'].min_border_left=65
    plot.handles['plot'].min_border_top=20
    plot.handles['plot'].min_border_bottom=65
    plot.handles['plot'].outline_line_color='black'
    plot.handles['plot'].outline_line_alpha=1
    plot.handles['plot'].outline_line_width=1
    plot.handles['plot'].toolbar.autohide = True
    

In [45]:
hv.Violin(df, kdims=["Models", "Lib"], vdims=["rhos"])# .opts(hooks=[one_decimal_x, one_decimal_y, fixmargins])

In [46]:
hv.Violin(df, kdims=["Models", "Lib"], vdims=["ndcgs"])# .opts(hooks=[one_decimal_x, one_decimal_y, fixmargins])

In [None]:
# plot all doubles and singles with boht models and all libraries

In [22]:
GB1_tri.item()["maxes"][0][0].shape

(1, 3, 100)

In [23]:
GB1_tri.item()["maxes"][0][0][0][0]

array([0.99653688, 0.99653688, 0.86553772, 1.        , 0.99653688,
       1.        , 1.        , 1.        , 1.        , 0.99653688,
       1.        , 0.86553772, 0.87359575, 0.99653688, 0.99653688,
       1.        , 1.        , 0.86553772, 0.99653688, 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.95900307,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.85784747, 1.        , 0.99653688, 1.        ,
       0.87359575, 1.        , 0.99653688, 1.        , 0.99653688,
       1.        , 0.98253018, 1.        , 0.85784747, 1.        ,
       1.        , 0.99653688, 1.        , 0.86553772, 0.86553772,
       1.        , 0.99653688, 0.99653688, 0.86553772, 1.        ,
       1.        , 0.99653688, 0.99653688, 1.        , 0.98555511,
       0.95900307, 1.        , 1.        , 1.        , 0.87359575,
       1.        , 1.        , 1.        , 0.98253018, 1.        ,
       1.        , 0.86553772, 1.        , 1.        , 1.     

In [24]:
GB1_tri.item()["maxes"][0][0][0][1]

array([0.99653688, 0.99653688, 0.86553772, 1.        , 0.99653688,
       1.        , 1.        , 1.        , 1.        , 0.99653688,
       1.        , 0.86553772, 0.87359575, 0.99653688, 0.99653688,
       1.        , 1.        , 0.86553772, 0.99653688, 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.95900307,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.85784747, 1.        , 0.99653688, 1.        ,
       0.87359575, 1.        , 0.99653688, 1.        , 0.99653688,
       1.        , 0.98253018, 1.        , 0.85784747, 1.        ,
       1.        , 0.99653688, 1.        , 0.86553772, 0.86553772,
       1.        , 0.99653688, 0.99653688, 0.86553772, 1.        ,
       1.        , 0.99653688, 0.99653688, 1.        , 0.98555511,
       0.95900307, 1.        , 1.        , 1.        , 0.87359575,
       1.        , 1.        , 1.        , 0.98253018, 1.        ,
       1.        , 0.86553772, 1.        , 1.        , 1.     

In [25]:
GB1_tri.item()["maxes"][0][0][0][2]

array([0.99653688, 0.99653688, 0.86553772, 1.        , 0.99653688,
       1.        , 1.        , 1.        , 1.        , 0.99653688,
       1.        , 0.86553772, 0.87359575, 0.99653688, 0.99653688,
       1.        , 1.        , 0.86553772, 0.99653688, 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.95900307,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.85784747, 1.        , 0.99653688, 1.        ,
       0.87359575, 1.        , 0.99653688, 1.        , 0.99653688,
       1.        , 0.98253018, 1.        , 0.85784747, 1.        ,
       1.        , 0.99653688, 1.        , 0.86553772, 0.86553772,
       1.        , 0.99653688, 0.99653688, 0.86553772, 1.        ,
       1.        , 0.99653688, 0.99653688, 1.        , 0.98555511,
       0.95900307, 1.        , 1.        , 1.        , 0.87359575,
       1.        , 1.        , 1.        , 0.98253018, 1.        ,
       1.        , 0.86553772, 1.        , 1.        , 1.     

In [8]:
gb1_d2m = np.load("results/mlde/saved/esm_score/none-single/scale2max/DHFR/one-hot_boosting|ridge_sample384_top96.npy", allow_pickle=True)
gb1_d2m.item().keys()

dict_keys(['top_seqs', 'maxes', 'means', 'ndcgs', 'unique', 'labelled', 'y_preds'])

In [10]:
gb1_d2m.item()["maxes"].shape, gb1_d2m.item()["top_seqs"].shape, gb1_d2m.item()["y_preds"].shape

((1, 2, 1, 3, 100), (1, 2, 1, 3, 100, 96), (1, 2, 1, 3, 100, 8000))

In [None]:
# model

In [43]:

data = pd.DataFrame.from_dict(maxes2)
data = data.melt(var_name='Landscape', value_name='max_fitness')

MLDE_fig = hv.Violin(
    data.sort_index(ascending=False), ['Landscape'], 'max_fitness'
).opts(
    # split='protein',
    frame_height=300,
    frame_width=300,
    violin_width=0.8,
    fontscale=1.3,
    hooks=[fixmargins],
    # show_legend=True,
    # inner=None,
    cut = 0,
    # legend_position='top',
    ylabel='Max Fitness Achieved',
    violin_color=hv.dim('Landscape').str(),
    cmap = 'Category10',
    xlabel = 'Protein',
    ylim = (0,1)
)

MLDE_fig

In [44]:
data

Unnamed: 0,Landscape,max_fitness
0,GB1,0.796777
1,GB1,0.614051
2,GB1,0.526876
3,GB1,0.741846
4,GB1,0.475515
...,...,...
345,GB1,0.918190
346,GB1,1.000000
347,GB1,1.000000
348,GB1,0.862211
