In [1]:
# Work from the repository root: later cells use relative paths such as
# 'data/<protein>/fitness.csv' and 'data_original/*/*.csv'.
%cd ~/alde4ssmula

/disk2/fli/alde4ssmula


In [2]:
# Enable IPython's autoreload extension (mode 2) so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import torch
from datetime import datetime
import glob
import os
import math
import pandas as pd

In [4]:
def index2regret(indices, y):
    """
    Convert queried indices into a running regret curve.

    For every run (row of ``indices``) and every query position ``j`` the
    result is ``1 - max(y[indices[run, :j + 1]])``, i.e. one minus the best
    value queried so far in that run.

    Args:
        indices: 2-D array-like (runs x queries) of integer positions into
            ``y``. Anything ``np.array(..., dtype=int)`` accepts works,
            including a CPU torch tensor.
        y: 1-D array-like (or torch tensor) of values indexed by ``indices``.
            NOTE(review): ``1 - max`` only equals "design-space maximum minus
            best queried value" when ``y`` has been scaled so that its maximum
            is 1 -- confirm that every caller normalises ``y`` first.

    Returns:
        torch.Tensor of shape (runs, queries) in torch's default float dtype.
    """
    indices = np.array(indices, dtype=int)
    if isinstance(y, torch.Tensor):
        # np.asarray cannot convert tensors that track gradients or live on a GPU.
        y = y.detach().cpu().numpy()

    queried_values = np.asarray(y)[indices]
    # A running maximum along the query axis replaces re-scanning every prefix
    # for every position (O(queries) instead of O(queries^2) per run).
    best_so_far = np.maximum.accumulate(queried_values, axis=1)
    return torch.as_tensor(1 - best_so_far, dtype=torch.get_default_dtype())

In [5]:
def load_tensors(subdir, y=None):
    """
    Load the saved query indices under ``subdir`` and summarise them as regret.

    Every file matching ``<subdir>/*indices*.pt`` (except paths containing
    'state_dict') is loaded, flattened to one row, and stacked with the other
    runs of the same model. Runs of different lengths are truncated to the
    shortest one seen so far. Each stack is then converted with
    ``index2regret`` and reduced to a per-query mean and standard error of the
    mean across runs.

    The model name is taken from the file's base name: the first
    '_'-separated component for files containing "Random", otherwise the first
    two components joined by '_'. The data type is the last '_'-separated
    component with its extension and digits removed (e.g. '1indices.pt' ->
    'indices').

    Args:
        subdir: directory that holds the ``*indices*.pt`` files.
        y: fitness values passed to ``index2regret``. When omitted, the
            module-level global ``y`` is used, which is how this function
            originally obtained it.

    Returns:
        (batch, budget) where ``batch[dtype][model_name] = (mean, sem)`` and
        ``budget`` is the smallest number of queries among the non-empty
        'indices' entries (``math.inf`` if there are none).

    Raises:
        NameError: if ``y`` is not passed and no global ``y`` exists.
    """
    if y is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            y = globals()["y"]
        except KeyError:
            raise NameError(
                "name 'y' is not defined; pass y= or define a global y"
            ) from None

    tensors = sorted(glob.glob(subdir + '/*indices*.pt'))

    tests = {}
    print('Models not included/not over budget yet:\n')

    for tensor in tensors:
        # The glob already guarantees the '.pt' suffix; only model
        # checkpoints ('state_dict') still need to be filtered out.
        if 'state_dict' in tensor:
            continue

        base = os.path.basename(tensor)
        name_parts = base.split('_')
        if "Random" not in base:
            nm = name_parts[0] + '_' + name_parts[1]
        else:
            nm = name_parts[0]

        # NOTE: torch.load unpickles the file -- only point this at trusted results.
        t = torch.load(tensor).cpu().detach()
        t = torch.reshape(t, (1, -1))

        # A minimum-length filter used to live here (skip runs shorter than
        # lim = 480 and print their file names); it is currently disabled, so
        # nothing is listed under the "not included" header above.

        dtype = name_parts[-1].split('.')[0]
        dtype = ''.join([ch for ch in dtype if not ch.isdigit()])

        runs = tests.setdefault(nm, {})
        if dtype not in runs:
            runs[dtype] = t
        else:
            stacked = runs[dtype]
            # Truncate both sides to the shorter run before stacking.
            width = min(stacked.size(-1), t.size(-1))
            runs[dtype] = torch.cat((stacked[:, :width], t[:, :width]), 0)

    print('\nModels included:\n')
    batch = {}
    budget, total = math.inf, math.inf
    for key in tests.keys():
        print(key)
        num_runs = -1
        for dtype in tests[key].keys():
            t = index2regret(tests[key][dtype], y)

            if 'indices' == dtype and t.size(-1) != 0:
                if t.size(-1) < budget:
                    budget = t.size(-1)
                num_runs = t.size(0)
            elif 'y' in dtype and t.size(-1) < total and t.size(-1) != 0:
                total = t.size(-1)

            sd, mean = torch.std_mean(t, 0)
            sem = sd / (t.size(0)**.5)
            batch.setdefault(dtype, {})[key] = (mean, sem)
        print("Runs: {}".format(num_runs))
    print(batch.keys())
    print('Budget: {}'.format(budget))
    # `total` was computed but `budget` was printed here. Report `total` when
    # it was actually determined; otherwise keep printing `budget` as before.
    print('Total queries (incl. init): {}'.format(total if total != math.inf else budget))

    return batch, budget

In [6]:
def tabulate_regret(df, tests, budget, subdir, randregret=None):
    """
    Append per-timestep results for every model in ``tests`` to ``df``.

    Args:
        df: DataFrame with the columns Protein, Encoding, Model, Acquisition,
            Timestep, Mean, Std (in that order). Rows are appended in place
            and the same object is returned.
        tests: mapping of model name -> (mean, sem) 1-D tensors of regret
            values. Names containing 'Random' are labelled 'Random' for
            encoding, model and acquisition; all other names are split on '-'
            with the first part used as the model and the second-to-last part
            as the acquisition function.
        budget: number of timesteps to tabulate. Models with fewer values are
            skipped; longer ones are truncated.
        subdir: path ending in ``<protein>/<encoding>``. Trailing slashes are
            ignored.
        randregret: unused; kept for interface compatibility.

    Returns:
        ``df`` with one row per (model, timestep). 'Mean' holds ``1 - mean``
        (regret converted back to the best value found) and 'Std' holds the
        second tuple element unchanged. NOTE(review): when fed from
        ``load_tensors`` that element is a standard error of the mean, not a
        standard deviation, despite the column name.
    """
    queries = np.arange(budget) + 1

    for name in sorted(tests.keys()):
        mean, sem = tests[name]
        mean = 1 - mean
        if mean.size(0) < budget:
            # Not enough queries for this model yet.
            continue
        mean = mean[:budget]
        sem = sem[:budget]

        # Strip trailing slashes so '.../GB1/onehot/' is parsed the same way
        # as '.../GB1/onehot' instead of yielding an empty encoding.
        path_parts = subdir.rstrip('/').split('/')
        protein = path_parts[-2]
        encoding = path_parts[-1]

        if 'Random' in name:
            encoding = 'Random'
            model = 'Random'
            acquisition = 'Random'
        else:
            name_parts = name.split('-')
            model = name_parts[0]
            acquisition = name_parts[-2]

        # Row-wise append is kept on purpose: it mutates the caller's frame in
        # place, which existing callers may rely on.
        for timestep, single_mean, single_std in zip(queries, np.array(mean), np.array(sem)):
            df.loc[len(df.index)] = [protein, encoding, model, acquisition, timestep, single_mean, single_std]
    return df

In [7]:
# for res_dir in (glob.glob('results/all_*')):
RESULT_DIRS = ["/disk2/fli/alde4ssmula/results/4eq_96"]
PROTEINS = [
    "DHFR", "GB1", "ParD2", "ParD3",
    "TrpB3A", "TrpB3B", "TrpB3C", "TrpB3D", "TrpB3E",
    "TrpB3F", "TrpB3G", "TrpB3H", "TrpB3I", "TrpB4",
]
ENCODINGS = ['onehot']
RESULT_COLUMNS = ['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep', 'Mean', 'Std']
# Internal model identifiers -> names written to the results file.
MODEL_DISPLAY_NAMES = {
    'BOOSTING_ENSEMBLE': 'Boosting Ensemble',
    'GP_BOTORCH': 'GP',
    'DNN_ENSEMBLE': 'DNN Ensemble',
    'DKL_BOTORCH': 'DKL',
}

for res_dir in RESULT_DIRS:
    print(res_dir)

    # Collect the max-fitness curves of every dataset/encoding in one table.
    df = pd.DataFrame(columns=RESULT_COLUMNS)

    for protein in PROTEINS:
        for encoding in ENCODINGS:
            subdir = f"{res_dir}/{protein}/{encoding}"

            # `y` is deliberately a notebook-level name: load_tensors reads it.
            # It is the fitness column scaled so that its maximum is 1.
            fitness_df = pd.read_csv(f"data/{protein}/fitness.csv")
            raw_fitness = fitness_df['fitness'].values
            y = raw_fitness / raw_fitness.max()

            print(subdir)

            batch, budget = load_tensors(subdir)
            df = tabulate_regret(df, batch['indices'], budget, subdir, randregret=None)

    # Clean up the table and write it next to the raw results.
    df = df.drop_duplicates(
        subset=['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep'],
        keep='first',
    )
    df['Model'] = df['Model'].replace(MODEL_DISPLAY_NAMES)
    # The random baseline is stored under the GREEDY acquisition label.
    df['Acquisition'] = df['Acquisition'].replace('Random', 'GREEDY')
    df.to_csv(f"{res_dir}/all_results.csv", index=False)

    del df

/disk2/fli/alde4ssmula/results/4eq_96
/disk2/fli/alde4ssmula/results/4eq_96/DHFR/onehot
Models not included/not over budget yet:


Models included:

BOOSTING_ENSEMBLE-DO-0-RBF-GREEDY-[60, 1]
Runs: 50
BOOSTING_ENSEMBLE-DO-0-RBF-TS-[60, 1]
Runs: 50
BOOSTING_ENSEMBLE-DO-0-RBF-UCB-[60, 1]
Runs: 50
DKL_BOTORCH-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
DKL_BOTORCH-DO-0-RBF-TS-[30, 1]
Runs: 50
DKL_BOTORCH-DO-0-RBF-UCB-[30, 1]
Runs: 50
DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
DNN_ENSEMBLE-DO-0-RBF-TS-[30, 1]
Runs: 50
DNN_ENSEMBLE-DO-0-RBF-UCB-[30, 1]
Runs: 50
GP_BOTORCH-DO-0-RBF-GREEDY-[60, 1]
Runs: 50
GP_BOTORCH-DO-0-RBF-TS-[60, 1]
Runs: 50
GP_BOTORCH-DO-0-RBF-UCB-[60, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 384
Total queries (incl. init): 384
/disk2/fli/alde4ssmula/results/4eq_96/GB1/onehot
Models not included/not over budget yet:


Models included:

BOOSTING_ENSEMBLE-DO-0-RBF-GREEDY-[80, 1]
Runs: 50
BOOSTING_ENSEMBLE-DO-0-RBF-TS-[80, 1]
Runs: 50
BOOSTING_ENSEMBLE-DO-0-RBF-UCB

In [8]:
# Reload the table that the aggregation cell wrote to <res_dir>/all_results.csv.
results_csv = "/disk2/fli/alde4ssmula/results/4eq_96/all_results.csv"
df = pd.read_csv(results_csv)

In [11]:
# Rows at timestep 384 for the Boosting Ensemble model with GREEDY
# acquisition on the onehot encoding, one per protein.
df.loc[
    df["Encoding"].eq("onehot")
    & df["Acquisition"].eq("GREEDY")
    & df["Model"].eq("Boosting Ensemble")
    & df["Timestep"].eq(384)
]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
383,DHFR,onehot,Boosting Ensemble,GREEDY,384,0.934166,0.009495
5375,GB1,onehot,Boosting Ensemble,GREEDY,384,0.768671,0.017338
10367,ParD2,onehot,Boosting Ensemble,GREEDY,384,0.999476,0.000296
15359,ParD3,onehot,Boosting Ensemble,GREEDY,384,0.996783,0.000778
20351,TrpB3A,onehot,Boosting Ensemble,GREEDY,384,0.647764,0.057646
25343,TrpB3B,onehot,Boosting Ensemble,GREEDY,384,0.235764,0.041905
30335,TrpB3C,onehot,Boosting Ensemble,GREEDY,384,0.364851,0.020377
35327,TrpB3D,onehot,Boosting Ensemble,GREEDY,384,0.963345,0.011131
40319,TrpB3E,onehot,Boosting Ensemble,GREEDY,384,0.478813,0.063434
45311,TrpB3F,onehot,Boosting Ensemble,GREEDY,384,0.949399,0.028288


In [12]:
# Same selection as the previous cell, but for the DNN Ensemble model.
df.loc[
    df["Encoding"].eq("onehot")
    & df["Acquisition"].eq("GREEDY")
    & df["Model"].eq("DNN Ensemble")
    & df["Timestep"].eq(384)
]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
2687,DHFR,onehot,DNN Ensemble,GREEDY,384,1.0,0.0
7679,GB1,onehot,DNN Ensemble,GREEDY,384,0.895982,0.016706
12671,ParD2,onehot,DNN Ensemble,GREEDY,384,0.997718,0.000789
17663,ParD3,onehot,DNN Ensemble,GREEDY,384,0.99079,0.000757
22655,TrpB3A,onehot,DNN Ensemble,GREEDY,384,0.667309,0.056693
27647,TrpB3B,onehot,DNN Ensemble,GREEDY,384,0.192461,0.038734
32639,TrpB3C,onehot,DNN Ensemble,GREEDY,384,0.354706,0.025168
37631,TrpB3D,onehot,DNN Ensemble,GREEDY,384,0.986773,0.005237
42623,TrpB3E,onehot,DNN Ensemble,GREEDY,384,0.550468,0.062698
47615,TrpB3F,onehot,DNN Ensemble,GREEDY,384,0.960713,0.022858


In [13]:
# DNN Ensemble / GREEDY / onehot rows at timestep 480.
# NOTE(review): the aggregation cell above reports a budget of 384, so this
# only returns rows when the loaded CSV comes from a run with a larger
# budget -- confirm which results file this cell was meant for.
df.loc[
    df["Encoding"].eq("onehot")
    & df["Acquisition"].eq("GREEDY")
    & df["Model"].eq("DNN Ensemble")
    & df["Timestep"].eq(480)
]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
3359,DHFR,onehot,DNN Ensemble,GREEDY,480,0.968165,0.008206
9599,GB1,onehot,DNN Ensemble,GREEDY,480,0.750532,0.018383
15839,ParD2,onehot,DNN Ensemble,GREEDY,480,1.0,0.0
22079,ParD3,onehot,DNN Ensemble,GREEDY,480,0.989638,0.000567
28319,TrpB3A,onehot,DNN Ensemble,GREEDY,480,0.633399,0.057154
34559,TrpB3B,onehot,DNN Ensemble,GREEDY,480,0.354836,0.056066
40799,TrpB3C,onehot,DNN Ensemble,GREEDY,480,0.384993,0.03052
47039,TrpB3D,onehot,DNN Ensemble,GREEDY,480,0.960084,0.008563
53279,TrpB3E,onehot,DNN Ensemble,GREEDY,480,0.725422,0.056154
59519,TrpB3F,onehot,DNN Ensemble,GREEDY,480,0.960391,0.022018


In [21]:

def checkNgen_folder(folder_path: str) -> str:

    """
    Make sure a folder and all of its parent folders exist.

    Each missing level is created from the top down and announced with a
    "Making <path> ..." message.

    Args:
    - folder_path: str, the folder path (relative or absolute)

    Returns:
    - str, ``folder_path`` exactly as it was passed in
    """

    # Walk upwards with os.path so the separator and the root are handled by
    # the standard library. Splitting on "/" and re-joining produced paths
    # with a doubled leading slash ("//tmp/...") for absolute inputs.
    missing = []
    current = os.path.normpath(folder_path)
    while current and not os.path.exists(current):
        missing.append(current)
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root.
            break
        current = parent

    for subfolder_path in reversed(missing):
        print(f"Making {subfolder_path} ...")
        # exist_ok avoids a crash if something else creates the folder
        # between the existence check and this call.
        os.makedirs(subfolder_path, exist_ok=True)
    return folder_path
def preprocessing_data(path_pattern: str = 'data_original/*/*.csv') -> None:
    """
    Clean every CSV matching ``path_pattern`` and write it under ``data/``.

    For each input file, rows whose 'AAs' value contains a literal '*' are
    dropped, 'AAs' is renamed to 'Combo', and only the 'Combo' and 'fitness'
    columns are written. The output path is the input path with
    'data_original' replaced by 'data'; missing output folders are created.

    Args:
    - path_pattern: str, glob pattern of the input CSV files.
      NOTE(review): if a matched path does not contain 'data_original', the
      output path equals the input path and the file is overwritten in place.
    """
    for file_path in glob.glob(path_pattern):
        output_path = file_path.replace('data_original', 'data')
        checkNgen_folder(os.path.dirname(output_path))

        df = pd.read_csv(file_path)
        # Drop rows with '*' in the AAs name. Match it literally: the former
        # pattern '\*' in a non-raw string is an invalid escape sequence and
        # triggers a warning on current Python versions.
        has_star = df['AAs'].str.contains('*', regex=False)
        df = df[~has_star].copy()
        df = df.rename(columns={'AAs': 'Combo'})
        df[["Combo", "fitness"]].copy().to_csv(output_path, index=False)
        print(f'Processed {file_path}')

In [22]:
# Build the cleaned data/<protein>/fitness.csv files from data_original/.
# NOTE(review): the aggregation cell earlier in this notebook reads
# 'data/<protein>/fitness.csv', so on a fresh kernel this step has to run
# before that cell.
preprocessing_data()

Making data ...
Making data/TrpB4 ...
Processed data_original/TrpB4/fitness.csv
Making data/ParD2 ...
Processed data_original/ParD2/fitness.csv
Making data/TrpB3H ...
Processed data_original/TrpB3H/fitness.csv
Making data/ParD3 ...
Processed data_original/ParD3/fitness.csv
Making data/TrpB3B ...
Processed data_original/TrpB3B/fitness.csv
Making data/DHFR ...
Processed data_original/DHFR/fitness.csv
Making data/TrpB3D ...
Processed data_original/TrpB3D/fitness.csv
Making data/TrpB3A ...
Processed data_original/TrpB3A/fitness.csv
Making data/TrpB3I ...
Processed data_original/TrpB3I/fitness.csv
Making data/GB1 ...
Processed data_original/GB1/fitness.csv
Making data/TrpB3F ...
Processed data_original/TrpB3F/fitness.csv
Making data/TrpB3C ...
Processed data_original/TrpB3C/fitness.csv
Making data/TrpB3E ...
Processed data_original/TrpB3E/fitness.csv
Making data/TrpB3G ...
Processed data_original/TrpB3G/fitness.csv
