In [1]:
%cd ~/alde4ssmula

/disk2/fli/alde4ssmula


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import torch
from datetime import datetime
import glob
import os
import math
import pandas as pd

In [4]:
def index2regret(indices, y):
    """
    Converts list of queried indices to regret (difference between the max value in the deisgn space and the max queired value)
    """
    indices = np.array(indices, dtype=int)
    regret = torch.zeros((indices.shape[0], indices.shape[1]))
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            regret[i, j] = 1 - y[indices[i, :j+1]].max()
    return regret

In [5]:
def load_tensors(subdir, y):
    """
    loads al indices from a directory and converts them to regret values
    """
    tensors = sorted(glob.glob(subdir + '/*indices*.pt'))
        
    tests = {}
    print('Models not included/not over budget yet:\n')

    for tensor in tensors:
        num = int(tensor.split('indices')[0].split('_')[-1])

        if '.pt' in tensor and 'state_dict' not in tensor:
            first = False
            if "Random" not in os.path.basename(tensor):
                nm = os.path.basename(tensor).split('_')[0] + '_' + os.path.basename(tensor).split('_')[1]
            else:     
                nm = os.path.basename(tensor).split('_')[0]

            t = torch.load(tensor).cpu().detach()
            t = torch.reshape(t, (1, -1))

            # lim = 480
            # if t.size(-1) < lim:
            #     if 'indices.pt' in tensor:
            #         print(tensor.split('/')[-1])
            #     continue
            if nm in tests.keys():
                d = tests[nm]
            else:
                d = {}
                tests[nm] = d

            dtype = os.path.basename(tensor).split('_')[-1].split('.')[0]
            dtype = ''.join([i for i in dtype if not i.isdigit()])
            if dtype in d.keys():
                arr = d[dtype]
            else:
                arr = t
                d[dtype] = arr
                first = True
            if first:
                pass
            elif t.size(-1) < arr.size(-1):
                arr = torch.cat((arr[:,:t.size(-1)], t), 0)
            else:
                arr = torch.cat((arr, t[:,:arr.size(-1)]), 0)
            d[dtype] = arr
            tests[nm] = d

    print('\nModels included:\n')
    batch = {}
    budget, total= math.inf, math.inf
    for key in tests.keys():
        # print(key)
        num_runs = -1
        for dtype in tests[key].keys():
            t = tests[key][dtype]
            t = index2regret(t, y)
            
            if 'indices' == dtype and t.size(-1) != 0:
                if t.size(-1) < budget:
                    budget = t.size(-1)
                num_runs = t.size(0)
            elif 'y' in dtype and t.size(-1) < total and t.size(-1) != 0:
                total = t.size(-1)
            
            sd, mean = torch.std_mean(t, 0)
            sem = sd / (t.size(0)**.5)
            # add fraction achieve max regret
            frac_max_mean = torch.where(t == 0, torch.tensor(1.), torch.tensor(0.)).sum(dim=0) / t.size(0)
        
            
            if dtype in batch.keys():
                d = batch[dtype]
            else:
                d = {}
                batch[dtype] = d

            d[key] = (mean, sem, frac_max_mean)

            batch[dtype] = d
        print("Runs: {}".format(num_runs))
    print(batch.keys())
    print('Budget: {}'.format(budget))
    print('Total queries (incl. init): {}'.format(budget))

    return batch, budget

In [6]:
def tabulate_regret(df, tests, budget, subdir, randregret=None):
    """
    Tabulates loaded regret values into an organized dataframe.
    """
    names = []
    queries = np.arange(budget) + 1
    
    for name in sorted(tests.keys()):
        names.append(name)

        mean, sem, frac_max = tests[name]
        mean = 1 - mean
        if mean.size(0) < budget:
            names.pop()
            continue
        if mean.size(0) > budget:
            mean = mean[:budget]
            sem = sem[:budget]

        protein = subdir.split('/')[-2]
        encoding = subdir.split('/')[-1]

        if 'Random' in name:
            encoding = 'Random'
            model = 'Random'
            acquisition = 'Random'
        else:
            names = name.split('-')
            model = names[0]
            acquisition = names[-2]

        for timestep, single_mean, single_std, single_frac_max in zip(queries, np.array(mean), np.array(sem), np.array(frac_max)):
            df.loc[len(df.index)] = [protein, encoding, model, acquisition, timestep, single_mean, single_std, single_frac_max]
    return df

In [73]:
index_pt = "/disk2/fli/alde4ssmula/results/4eq_30/ParD3/onehot/Random_24indices.pt"

In [74]:
t = torch.load(index_pt).cpu().detach().numpy().tolist()

In [75]:
df = pd.read_csv("/disk2/fli/alde4ssmula/data/ParD3/fitness.csv")

In [76]:
i22_df = df.iloc[t]
i22_df

Unnamed: 0,Combo,fitness
651,CPW,-0.019547
703,CSM,0.222773
754,CWC,0.124849
824,DCR,0.889464
1730,FIN,0.100949
...,...,...
7316,WNG,0.049036
7377,WRL,0.490355
7479,WYQ,0.256487
7715,YNN,0.071595


In [77]:
i22_df.iloc[:30]

Unnamed: 0,Combo,fitness
651,CPW,-0.019547
703,CSM,0.222773
754,CWC,0.124849
824,DCR,0.889464
1730,FIN,0.100949
2614,HPI,0.001712
2634,HQI,0.674619
2900,IHT,0.432697
2978,IMR,0.244391
3018,IPR,-0.010507


In [78]:
i22_df.iloc[31:60]

Unnamed: 0,Combo,fitness
3,AAE,0.828332
125,AHG,0.037661
178,AKW,0.92143
261,AQD,0.396037
313,ASR,0.228878
403,CAG,0.050884
406,CAK,0.268261
424,CCK,0.099347
482,CFI,0.304792
501,CGI,0.152847


In [79]:
i22_df.iloc[61:90]

Unnamed: 0,Combo,fitness
2577,HMK,0.266129
2614,HPI,0.001712
2634,HQI,0.674619
2640,HQQ,0.581314
2652,HRG,0.173389
2703,HTT,0.378826
2730,HWE,0.498909
2900,IHT,0.432697
2949,ILG,0.081899
2978,IMR,0.244391


In [80]:
i22_df.iloc[91:]

Unnamed: 0,Combo,fitness
4680,NTE,0.722035
4783,PCN,-0.068334
4800,PDQ,0.061856
4867,PHL,0.266665
4927,PLT,-0.023361
5081,PWE,0.045875
5169,QEA,0.433708
5262,QIV,0.719616
5394,QRT,0.540288
5433,QTS,0.26516


In [9]:
# for res_dir in (glob.glob('results/all_*')):
for res_dir in [
    "/disk2/fli/alde4ssmula/results/4eq_120"
]:
    print(res_dir)

    #Loop over different datasets and encodings
    #tabulate all max fitness values into one dataframe
    df = pd.DataFrame(columns=['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep', 'Mean', 'Std', "Frac"])

    for protein in ["DHFR", "GB1", "ParD2", "ParD3", "TrpB3A", "TrpB3B", "TrpB3C", "TrpB3D", "TrpB3E", "TrpB3F", "TrpB3G", "TrpB3H", "TrpB3I", "TrpB4"]:
        for encoding in ['onehot']:
            subdir =  res_dir + '/' + protein + '/' + encoding
            fitness_df = pd.read_csv('data/' + protein + '/fitness.csv')
            y = fitness_df['fitness'].values
            y = y/y.max()

            print(subdir)
        
            batch, budget = load_tensors(subdir, y)
            
            df = tabulate_regret(df, batch['indices'], budget, subdir, randregret=None)

    #clean up the results and save
    df = df.drop_duplicates(subset=['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep'], keep='first')
    df['Model'] = df['Model'].replace('BOOSTING_ENSEMBLE', 'Boosting Ensemble')
    df['Model'] = df['Model'].replace('GP_BOTORCH', 'GP')
    df['Model'] = df['Model'].replace('DNN_ENSEMBLE', 'DNN Ensemble')
    df['Model'] = df['Model'].replace('DKL_BOTORCH', 'DKL')
    df['Acquisition'] = df['Acquisition'].replace('Random', 'GREEDY')
    df.to_csv(res_dir + '/all_results.csv', index=False)

    del df

/disk2/fli/alde4ssmula/results/4eq_120
/disk2/fli/alde4ssmula/results/4eq_120/DHFR/onehot
Models not included/not over budget yet:


Models included:

Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
/disk2/fli/alde4ssmula/results/4eq_120/GB1/onehot
Models not included/not over budget yet:


Models included:

Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
/disk2/fli/alde4ssmula/results/4eq_120/ParD2/onehot
Models not included/not over budget yet:


Models included:

Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
/disk2/fli/alde4ssmula/results/4eq_120/ParD3/onehot
Models not included/n

In [10]:
df = pd.read_csv("/disk2/fli/alde4ssmula/results/4eq_120/all_results.csv")

In [11]:
df[(df["Encoding"]=="onehot") & (df["Acquisition"] == "GREEDY") & (df["Model"] == "Boosting Ensemble") & (df["Timestep"] == 480)]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std,Frac
479,DHFR,onehot,Boosting Ensemble,GREEDY,480,0.961414,0.008266,0.62
6719,GB1,onehot,Boosting Ensemble,GREEDY,480,0.79649,0.019705,0.18
12959,ParD2,onehot,Boosting Ensemble,GREEDY,480,0.999651,0.000244,0.96
19199,ParD3,onehot,Boosting Ensemble,GREEDY,480,0.997652,0.000673,0.8
25439,TrpB3A,onehot,Boosting Ensemble,GREEDY,480,0.73122,0.054823,0.62
31679,TrpB3B,onehot,Boosting Ensemble,GREEDY,480,0.291806,0.04811,0.18
37919,TrpB3C,onehot,Boosting Ensemble,GREEDY,480,0.372879,0.019907,0.04
44159,TrpB3D,onehot,Boosting Ensemble,GREEDY,480,0.978643,0.007158,0.64
50399,TrpB3E,onehot,Boosting Ensemble,GREEDY,480,0.577173,0.062817,0.5
56639,TrpB3F,onehot,Boosting Ensemble,GREEDY,480,0.982492,0.017508,0.98


In [13]:
df[(df["Encoding"]=="onehot") & (df["Acquisition"] == "GREEDY") & (df["Model"] == "DNN Ensemble") & (df["Timestep"] == 384)]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std,Frac
3263,DHFR,onehot,DNN Ensemble,GREEDY,384,0.999301,0.000489,0.96
9503,GB1,onehot,DNN Ensemble,GREEDY,384,0.900271,0.016375,0.5
15743,ParD2,onehot,DNN Ensemble,GREEDY,384,0.999302,0.000338,0.92
21983,ParD3,onehot,DNN Ensemble,GREEDY,384,0.992558,0.000801,0.36
28223,TrpB3A,onehot,DNN Ensemble,GREEDY,384,0.627585,0.057469,0.52
34463,TrpB3B,onehot,DNN Ensemble,GREEDY,384,0.318786,0.052496,0.22
40703,TrpB3C,onehot,DNN Ensemble,GREEDY,384,0.347858,0.025051,0.06
46943,TrpB3D,onehot,DNN Ensemble,GREEDY,384,0.98922,0.004549,0.74
53183,TrpB3E,onehot,DNN Ensemble,GREEDY,384,0.57752,0.063542,0.48
59423,TrpB3F,onehot,DNN Ensemble,GREEDY,384,1.0,0.0,1.0


In [12]:
df[(df["Encoding"]=="onehot") & (df["Acquisition"] == "GREEDY") & (df["Model"] == "Boosting Ensemble") & (df["Timestep"] == 480)]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std,Frac
479,DHFR,onehot,Boosting Ensemble,GREEDY,480,0.961414,0.008266,0.62
6719,GB1,onehot,Boosting Ensemble,GREEDY,480,0.79649,0.019705,0.18
12959,ParD2,onehot,Boosting Ensemble,GREEDY,480,0.999651,0.000244,0.96
19199,ParD3,onehot,Boosting Ensemble,GREEDY,480,0.997652,0.000673,0.8
25439,TrpB3A,onehot,Boosting Ensemble,GREEDY,480,0.73122,0.054823,0.62
31679,TrpB3B,onehot,Boosting Ensemble,GREEDY,480,0.291806,0.04811,0.18
37919,TrpB3C,onehot,Boosting Ensemble,GREEDY,480,0.372879,0.019907,0.04
44159,TrpB3D,onehot,Boosting Ensemble,GREEDY,480,0.978643,0.007158,0.64
50399,TrpB3E,onehot,Boosting Ensemble,GREEDY,480,0.577173,0.062817,0.5
56639,TrpB3F,onehot,Boosting Ensemble,GREEDY,480,0.982492,0.017508,0.98


In [21]:

def checkNgen_folder(folder_path: str) -> str:

    """
    Check if the folder and its subfolder exists
    create a new directory if not
    Args:
    - folder_path: str, the folder path
    """

    split_list = os.path.normpath(folder_path).split("/")
    
    # check if absolute
    if os.path.isabs(folder_path):
        split_list[0] = "/" + split_list[0]

    for p, _ in enumerate(split_list):
        subfolder_path = "/".join(split_list[: p + 1])
        if not os.path.exists(subfolder_path):
            print(f"Making {subfolder_path} ...")
            os.mkdir(subfolder_path)
    return folder_path
def preprocessing_data(path_pattern: str = 'data_original/*/*.csv') -> None:
    for file_path in glob.glob(path_pattern):
        output_path = file_path.replace('data_original', 'data')
        checkNgen_folder(os.path.dirname(output_path))

        df = pd.read_csv(file_path)
        # get rid of rows with * in the name of AAs
        df = df[~df['AAs'].str.contains('\*')].copy()
        df = df.rename(columns={'AAs': 'Combo'})
        df[["Combo", "fitness"]].copy().to_csv(output_path, index=False)
        print(f'Processed {file_path}')

In [22]:
preprocessing_data()

Making data ...
Making data/TrpB4 ...
Processed data_original/TrpB4/fitness.csv
Making data/ParD2 ...
Processed data_original/ParD2/fitness.csv
Making data/TrpB3H ...
Processed data_original/TrpB3H/fitness.csv
Making data/ParD3 ...
Processed data_original/ParD3/fitness.csv
Making data/TrpB3B ...
Processed data_original/TrpB3B/fitness.csv
Making data/DHFR ...
Processed data_original/DHFR/fitness.csv
Making data/TrpB3D ...
Processed data_original/TrpB3D/fitness.csv
Making data/TrpB3A ...
Processed data_original/TrpB3A/fitness.csv
Making data/TrpB3I ...
Processed data_original/TrpB3I/fitness.csv
Making data/GB1 ...
Processed data_original/GB1/fitness.csv
Making data/TrpB3F ...
Processed data_original/TrpB3F/fitness.csv
Making data/TrpB3C ...
Processed data_original/TrpB3C/fitness.csv
Making data/TrpB3E ...
Processed data_original/TrpB3E/fitness.csv
Making data/TrpB3G ...
Processed data_original/TrpB3G/fitness.csv
