In [1]:
%cd ~/ALDE

/disk2/fli/ALDE


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import torch
from datetime import datetime
import glob
import os
import math
import pandas as pd

In [4]:
def index2regret(indices, y):
    """
    Converts queried indices to regret (difference between the max value in the design space and the max queried value so far).

    Regret is computed as ``1 - max(y[queried so far])``, i.e. the maximum of the
    design space is taken to be 1 (NOTE(review): assumes ``y`` was normalized by
    its maximum before being passed in -- confirm at the call site).

    Args:
        indices: 2-D array-like of integer indices into ``y``; each row is one
            trajectory of queries, in query order.
        y: 1-D array-like of fitness values.

    Returns:
        A float32 ``torch.Tensor`` with the same shape as ``indices`` whose entry
        ``[i, j]`` is ``1 - max(y[indices[i, :j + 1]])``.
    """
    indices = np.array(indices, dtype=int)
    # A running maximum along each row gives the best value seen after every
    # query in one pass, instead of re-scanning each prefix from scratch.
    best_so_far = np.maximum.accumulate(np.asarray(y)[indices], axis=1)
    return torch.as_tensor(1 - best_so_far, dtype=torch.float32)

In [5]:
def load_tensors(subdir, fitness=None, min_queries=480):
    """
    Loads all index tensors from a directory and converts them to regret values.

    Every file matching ``<subdir>/*indices*.pt`` (except paths containing
    ``state_dict``) is loaded, flattened to a single row and stacked with the
    other files that share the same model name. The stacked indices are passed
    through ``index2regret`` and summarized across rows.

    Args:
        subdir: directory containing the saved ``.pt`` files.
        fitness: fitness values handed to ``index2regret``. When omitted, the
            notebook-level variable ``y`` is used, which keeps existing
            ``load_tensors(subdir)`` calls working.
        min_queries: files holding fewer entries than this are skipped.

    Returns:
        ``(batch, budget)`` where ``batch[dtype][model] = (mean, sem)`` holds the
        per-timestep mean and standard error over the stacked rows, and
        ``budget`` is the shortest ``'indices'`` trajectory length encountered
        (``math.inf`` if none was loaded).
    """
    # Resolve the fitness vector up front; the global lookup only happens when
    # the caller did not supply one.
    fitness_values = y if fitness is None else fitness

    tensor_paths = sorted(glob.glob(subdir + '/*indices*.pt'))

    tests = {}
    print('Models not included/not over budget yet:\n')

    for path in tensor_paths:
        if 'state_dict' in path:
            continue

        fname = os.path.basename(path)
        parts = fname.split('_')
        # Model names contain one underscore (e.g. DNN_ENSEMBLE-...), so the
        # first two pieces are rejoined; the random baseline has none.
        if "Random" not in fname:
            nm = parts[0] + '_' + parts[1]
        else:
            nm = parts[0]

        # map_location lets tensors saved on a GPU be read on a CPU-only machine.
        run = torch.load(path, map_location='cpu').cpu().detach()
        run = torch.reshape(run, (1, -1))

        if run.size(-1) < min_queries:
            if 'indices.pt' in path:
                print(path.split('/')[-1])
            continue

        by_dtype = tests.setdefault(nm, {})

        # Last underscore-separated piece without extension and digits,
        # e.g. "12indices.pt" -> "indices".
        dtype = parts[-1].split('.')[0]
        dtype = ''.join(ch for ch in dtype if not ch.isdigit())

        if dtype not in by_dtype:
            by_dtype[dtype] = run
            continue

        # Stack the new row under the previous ones, truncating whichever side
        # is longer so that all rows share the same length.
        stacked = by_dtype[dtype]
        width = min(stacked.size(-1), run.size(-1))
        by_dtype[dtype] = torch.cat((stacked[:, :width], run[:, :width]), 0)

    print('\nModels included:\n')
    batch = {}
    budget = math.inf
    for key, by_dtype in tests.items():
        print(key)
        num_runs = -1
        for dtype, stacked in by_dtype.items():
            regret = index2regret(stacked, fitness_values)

            if dtype == 'indices' and regret.size(-1) != 0:
                budget = min(budget, regret.size(-1))
                num_runs = regret.size(0)

            sd, mean = torch.std_mean(regret, 0)
            # Standard error of the mean over the stacked rows.
            sem = sd / (regret.size(0) ** .5)
            batch.setdefault(dtype, {})[key] = (mean, sem)
        print("Runs: {}".format(num_runs))
    print(batch.keys())
    print('Budget: {}'.format(budget))
    # NOTE(review): this line has always reported the same value as the budget
    # line above; kept as-is so the printed log does not change.
    print('Total queries (incl. init): {}'.format(budget))

    return batch, budget

In [6]:
def tabulate_regret(df, tests, budget, subdir, randregret=None):
    """
    Tabulates loaded regret values into an organized dataframe.

    Args:
        df: DataFrame to extend. Rows are appended in place (and the same frame
            is returned); each row is written as
            ``[protein, encoding, model, acquisition, timestep, mean, sem]``,
            so ``df`` must have seven columns in that order.
        tests: mapping ``name -> (mean, sem)`` of 1-D tensors indexed by timestep.
        budget: number of timesteps to tabulate; entries with fewer timesteps
            are skipped and longer ones are truncated.
        subdir: results path whose last two ``/``-separated components are used
            as the protein and the encoding.
        randregret: unused; kept so existing calls remain valid.

    Returns:
        ``df`` with one row per (name, timestep).
    """
    queries = np.arange(budget) + 1

    for name in sorted(tests.keys()):
        mean, sem = tests[name]
        # The table stores 1 - (loaded value), i.e. regret is flipped back into
        # the best value reached so far.
        mean = 1 - mean
        if mean.size(0) < budget:
            continue
        mean = mean[:budget]
        sem = sem[:budget]

        protein = subdir.split('/')[-2]
        encoding = subdir.split('/')[-1]

        if 'Random' in name:
            encoding = 'Random'
            model = 'Random'
            acquisition = 'Random'
        else:
            # Names are dash-separated: the model comes first and the
            # acquisition function is the second-to-last field.
            name_parts = name.split('-')
            model = name_parts[0]
            acquisition = name_parts[-2]

        # Note: the last column receives the standard error values passed in.
        for timestep, single_mean, single_std in zip(queries, np.array(mean), np.array(sem)):
            df.loc[len(df.index)] = [protein, encoding, model, acquisition, timestep, single_mean, single_std]
    return df

In [7]:
# Walk over every dataset/encoding pair and collect the max-fitness
# trajectories of all models into a single long-format dataframe.
df = pd.DataFrame(columns=['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep', 'Mean', 'Std'])

proteins = [
    "DHFR", "GB1", "ParD2", "ParD3",
    "TrpB3A", "TrpB3B", "TrpB3C", "TrpB3D", "TrpB3E",
    "TrpB3F", "TrpB3G", "TrpB3H", "TrpB3I", "TrpB4",
]
encodings = ['onehot']

for protein in proteins:
    for encoding in encodings:
        subdir = 'results/384+96+baseline/' + protein + '/' + encoding

        # `y` has to stay a notebook-level name: load_tensors looks it up when
        # it converts queried indices into regret.
        fitness_df = pd.read_csv('data/' + protein + '/fitness.csv')
        y = fitness_df['fitness'].values
        y = y / y.max()

        batch, budget = load_tensors(subdir)
        df = tabulate_regret(df, batch['indices'], budget, subdir, randregret=None)

Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models 

In [8]:
# Tidy the table (drop repeated rows, use display-friendly labels) and save it.
key_cols = ['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep']
df = df.drop_duplicates(subset=key_cols, keep='first')

model_labels = {
    'BOOSTING_ENSEMBLE': 'Boosting Ensemble',
    'GP_BOTORCH': 'GP',
    'DNN_ENSEMBLE': 'DNN Ensemble',
    'DKL_BOTORCH': 'DKL',
}
for raw_label, display_label in model_labels.items():
    df['Model'] = df['Model'].replace(raw_label, display_label)

# The random baseline is filed under the GREEDY acquisition label.
df['Acquisition'] = df['Acquisition'].replace('Random', 'GREEDY')
df.to_csv('results/384+96+baseline/all_results.csv', index=False)
df

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
0,DHFR,onehot,DNN Ensemble,GREEDY,1,0.127463,0.001272
1,DHFR,onehot,DNN Ensemble,GREEDY,2,0.134273,0.002893
2,DHFR,onehot,DNN Ensemble,GREEDY,3,0.138818,0.003078
3,DHFR,onehot,DNN Ensemble,GREEDY,4,0.147883,0.003791
4,DHFR,onehot,DNN Ensemble,GREEDY,5,0.156102,0.004070
...,...,...,...,...,...,...,...
13435,TrpB4,Random,Random,GREEDY,476,0.545850,0.016771
13436,TrpB4,Random,Random,GREEDY,477,0.545850,0.016771
13437,TrpB4,Random,Random,GREEDY,478,0.545850,0.016771
13438,TrpB4,Random,Random,GREEDY,479,0.545850,0.016771


In [9]:
# Sanity check: which proteins have rows for the random baseline?
df.loc[df["Encoding"] == "Random", "Protein"].unique()

array(['DHFR', 'GB1', 'ParD2', 'ParD3', 'TrpB3A', 'TrpB3B', 'TrpB3C',
       'TrpB3D', 'TrpB3E', 'TrpB3F', 'TrpB3G', 'TrpB3H', 'TrpB3I',
       'TrpB4'], dtype=object)

In [10]:
# Row counts for the random baseline and the onehot encoding, side by side.
int((df["Encoding"] == "Random").sum()), int((df["Encoding"] == "onehot").sum())

(6720, 6720)

In [11]:
# Inspect the onehot trajectory of a single protein.
df.loc[(df["Protein"] == "TrpB4") & (df["Encoding"] == "onehot")]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
12480,TrpB4,onehot,DNN Ensemble,GREEDY,1,-0.007722,0.002754
12481,TrpB4,onehot,DNN Ensemble,GREEDY,2,0.007735,0.002825
12482,TrpB4,onehot,DNN Ensemble,GREEDY,3,0.016794,0.002755
12483,TrpB4,onehot,DNN Ensemble,GREEDY,4,0.020837,0.002715
12484,TrpB4,onehot,DNN Ensemble,GREEDY,5,0.028109,0.004071
...,...,...,...,...,...,...,...
12955,TrpB4,onehot,DNN Ensemble,GREEDY,476,0.574924,0.015302
12956,TrpB4,onehot,DNN Ensemble,GREEDY,477,0.574924,0.015302
12957,TrpB4,onehot,DNN Ensemble,GREEDY,478,0.574924,0.015302
12958,TrpB4,onehot,DNN Ensemble,GREEDY,479,0.574924,0.015302


In [12]:
# Final-timestep rows for the onehot encoding, one per protein.
df.loc[(df["Timestep"] == 480) & (df["Encoding"] == "onehot")]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
479,DHFR,onehot,DNN Ensemble,GREEDY,480,0.903667,0.010632
1439,GB1,onehot,DNN Ensemble,GREEDY,480,0.750532,0.018383
2399,ParD2,onehot,DNN Ensemble,GREEDY,480,1.0,0.0
3359,ParD3,onehot,DNN Ensemble,GREEDY,480,0.989638,0.000567
4319,TrpB3A,onehot,DNN Ensemble,GREEDY,480,0.399707,0.047229
5279,TrpB3B,onehot,DNN Ensemble,GREEDY,480,0.219586,0.033204
6239,TrpB3C,onehot,DNN Ensemble,GREEDY,480,0.357302,0.026715
7199,TrpB3D,onehot,DNN Ensemble,GREEDY,480,0.72106,0.023945
8159,TrpB3E,onehot,DNN Ensemble,GREEDY,480,0.295033,0.042731
9119,TrpB3F,onehot,DNN Ensemble,GREEDY,480,0.352908,0.034582


In [4]:
# Reload the saved summary table from disk.
baseline_csv = 'results/384+96+baseline/all_results.csv'
df = pd.read_csv(baseline_csv)
df

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
0,DHFR,onehot,DNN Ensemble,GREEDY,1,0.127463,0.001272
1,DHFR,onehot,DNN Ensemble,GREEDY,2,0.134273,0.002893
2,DHFR,onehot,DNN Ensemble,GREEDY,3,0.138818,0.003078
3,DHFR,onehot,DNN Ensemble,GREEDY,4,0.147883,0.003791
4,DHFR,onehot,DNN Ensemble,GREEDY,5,0.156102,0.004070
...,...,...,...,...,...,...,...
13435,TrpB4,Random,Random,GREEDY,476,0.545850,0.016771
13436,TrpB4,Random,Random,GREEDY,477,0.545850,0.016771
13437,TrpB4,Random,Random,GREEDY,478,0.545850,0.016771
13438,TrpB4,Random,Random,GREEDY,479,0.545850,0.016771


In [6]:
# Mean and spread, across proteins, of the final-timestep onehot results.
res_col = df.loc[(df["Encoding"] == "onehot") & (df["Timestep"] == 480), "Mean"]
(res_col.mean(), res_col.std())

(0.5765213514285714, 0.26350686507965904)

In [13]:
# Walk over every dataset/encoding pair and collect the max-fitness
# trajectories of all models into a single long-format dataframe.
df2 = pd.DataFrame(columns=['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep', 'Mean', 'Std'])

proteins = [
    "DHFR", "GB1", "ParD2", "ParD3",
    "TrpB3A", "TrpB3B", "TrpB3C", "TrpB3D", "TrpB3E",
    "TrpB3F", "TrpB3G", "TrpB3H", "TrpB3I", "TrpB4",
]
encodings = ['onehot']

for protein in proteins:
    for encoding in encodings:
        subdir = 'results/96+96+384/' + protein + '/' + encoding

        # `y` has to stay a notebook-level name: load_tensors looks it up when
        # it converts queried indices into regret.
        fitness_df = pd.read_csv('data/' + protein + '/fitness.csv')
        y = fitness_df['fitness'].values
        y = y / y.max()

        batch, budget = load_tensors(subdir)
        df2 = tabulate_regret(df2, batch['indices'], budget, subdir, randregret=None)

Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models included:

DNN_ENSEMBLE-DO-0-RBF-GREEDY-[30, 1]
Runs: 50
Random
Runs: 50
dict_keys(['indices'])
Budget: 480
Total queries (incl. init): 480
Models not included/not over budget yet:


Models 

In [14]:
# Tidy the table (drop repeated rows, use display-friendly labels) and save it.
key_cols = ['Protein', 'Encoding', 'Model', 'Acquisition', 'Timestep']
df2 = df2.drop_duplicates(subset=key_cols, keep='first')

model_labels = {
    'BOOSTING_ENSEMBLE': 'Boosting Ensemble',
    'GP_BOTORCH': 'GP',
    'DNN_ENSEMBLE': 'DNN Ensemble',
    'DKL_BOTORCH': 'DKL',
}
for raw_label, display_label in model_labels.items():
    df2['Model'] = df2['Model'].replace(raw_label, display_label)

# The random baseline is filed under the GREEDY acquisition label.
df2['Acquisition'] = df2['Acquisition'].replace('Random', 'GREEDY')
df2.to_csv('results/96+96+384/all_results.csv', index=False)
df2

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
0,DHFR,onehot,DNN Ensemble,GREEDY,1,0.131625,0.002329
1,DHFR,onehot,DNN Ensemble,GREEDY,2,0.138433,0.003048
2,DHFR,onehot,DNN Ensemble,GREEDY,3,0.139826,0.003173
3,DHFR,onehot,DNN Ensemble,GREEDY,4,0.143688,0.003200
4,DHFR,onehot,DNN Ensemble,GREEDY,5,0.157431,0.007018
...,...,...,...,...,...,...,...
13435,TrpB4,Random,Random,GREEDY,476,0.542627,0.017087
13436,TrpB4,Random,Random,GREEDY,477,0.542627,0.017087
13437,TrpB4,Random,Random,GREEDY,478,0.542627,0.017087
13438,TrpB4,Random,Random,GREEDY,479,0.542627,0.017087


In [15]:
# Final-timestep rows for the onehot encoding, one per protein.
df2.loc[(df2["Timestep"] == 480) & (df2["Encoding"] == "onehot")]

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
479,DHFR,onehot,DNN Ensemble,GREEDY,480,0.999711,0.000289
1439,GB1,onehot,DNN Ensemble,GREEDY,480,0.932049,0.013785
2399,ParD2,onehot,DNN Ensemble,GREEDY,480,0.998067,0.000727
3359,ParD3,onehot,DNN Ensemble,GREEDY,480,0.99158,0.000802
4319,TrpB3A,onehot,DNN Ensemble,GREEDY,480,0.288332,0.036041
5279,TrpB3B,onehot,DNN Ensemble,GREEDY,480,0.23144,0.028463
6239,TrpB3C,onehot,DNN Ensemble,GREEDY,480,0.346909,0.02497
7199,TrpB3D,onehot,DNN Ensemble,GREEDY,480,0.621035,0.022105
8159,TrpB3E,onehot,DNN Ensemble,GREEDY,480,0.40209,0.044415
9119,TrpB3F,onehot,DNN Ensemble,GREEDY,480,0.308876,0.020777


In [7]:
# Reload the saved summary table from disk.
second_csv = 'results/96+96+384/all_results.csv'
df2 = pd.read_csv(second_csv)
df2

Unnamed: 0,Protein,Encoding,Model,Acquisition,Timestep,Mean,Std
0,DHFR,onehot,DNN Ensemble,GREEDY,1,0.131625,0.002329
1,DHFR,onehot,DNN Ensemble,GREEDY,2,0.138433,0.003048
2,DHFR,onehot,DNN Ensemble,GREEDY,3,0.139826,0.003173
3,DHFR,onehot,DNN Ensemble,GREEDY,4,0.143688,0.003200
4,DHFR,onehot,DNN Ensemble,GREEDY,5,0.157431,0.007018
...,...,...,...,...,...,...,...
13435,TrpB4,Random,Random,GREEDY,476,0.542627,0.017087
13436,TrpB4,Random,Random,GREEDY,477,0.542627,0.017087
13437,TrpB4,Random,Random,GREEDY,478,0.542627,0.017087
13438,TrpB4,Random,Random,GREEDY,479,0.542627,0.017087


In [8]:
# Mean and spread, across proteins, of the final-timestep onehot results.
res_col2 = df2.loc[(df2["Encoding"] == "onehot") & (df2["Timestep"] == 480), "Mean"]
(res_col2.mean(), res_col2.std())

(0.5940152164285716, 0.29419134637597044)