In [2]:
###initialize imports and dataset
import math
import os
import random

import numpy as np
import pandas as pd
import deepchem as dc
from deepchem.utils.save import load_from_disk
from deepchem.data import data_loader
random.seed(0)

# Ground-truth table for the whole molecule pool (loaded as a pandas DataFrame).
dataset_file = "./enamineSubset10KGroundTruth.csv"
ground_truth_dataset = load_from_disk(dataset_file)

# Starting-set candidates: the 2500 rows with the lowest "bace" value (worst binders).
# The sort runs on a deep copy, so ground_truth_dataset keeps its original row order.
low_bace_dataset = (
    ground_truth_dataset
    .copy(deep=True)
    .sort_values(by="bace")
    .head(2500)
)

# Quick summary of what was loaded.
column_names = str(ground_truth_dataset.columns.values)
example_count = str(ground_truth_dataset.shape[0])
print("Columns of dataset: %s" % column_names)
print("Number of examples in dataset: %s" % example_count)

Columns of dataset: ['Unnamed: 0' 'SMILES' 'esol' 'logD' 'bace']
Number of examples in dataset: 10000


In [None]:
###initialize ground truth models and methods to access them

def load_oracle_models():
    """Build and restore the three single-task ground-truth ("oracle") models.

    Returns:
        dict mapping "bace", "esol" and "logD" to a GraphConvModel whose
        ``model_dir`` is ``./models/<name>`` and on which ``restore()`` has
        been called.
    """
    task_names = ("bace", "esol", "logD")

    # Construct every model first ...
    oracle = {
        task: dc.models.GraphConvModel(
            n_tasks=1,
            mode='regression',
            batch_size=50,
            random_seed=0,
            model_dir="./models/" + task,
        )
        for task in task_names
    }

    # ... then restore them in the same bace / esol / logD order.
    for task in task_names:
        oracle[task].restore()

    return oracle
    
def query_oracle(smiles):
    """Placeholder for on-the-fly evaluation of ``smiles``; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    

In [19]:
# Three-task regression model (seed 0) stored under ./models/test_model.
test_model = dc.models.GraphConvModel(
    n_tasks=3,
    mode='regression',
    batch_size=50,
    random_seed=0,
    model_dir="./models/test_model",
)

from deepchem.data.data_loader import featurize_smiles_df

def train_model(model, dataset):
    """Fit ``model`` for one epoch on the molecules in ``dataset``.

    The frame is written to ``./temp/training_dataset.csv`` and read back
    through a CSVLoader with a ConvMolFeaturizer. The targets are then
    normalized before fitting.

    Args:
        model: object exposing ``fit(dataset, nb_epoch=..., deterministic=...)``.
        dataset: pandas DataFrame; the loader reads its "SMILES", "bace",
            "logD" and "esol" columns.

    Returns:
        The NormalizationTransformer fitted on the featurized data. The model
        is trained on normalized targets, so callers that need predictions in
        raw units should keep this object (presumably via its inverse
        transform -- confirm against the DeepChem version in use).
    """
    dataset_temp_file = "./temp/training_dataset.csv"
    # to_csv does not create missing parent directories, so make sure ./temp exists.
    os.makedirs(os.path.dirname(dataset_temp_file), exist_ok=True)
    dataset.to_csv(dataset_temp_file)

    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)

    # Separate names for each stage instead of rebinding the ``dataset`` argument.
    # TODO: featurize the DataFrame directly (featurize_smiles_df) to avoid the CSV round trip.
    featurized = loader.featurize(dataset_temp_file)

    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=featurized)
    normalized = transformer.transform(featurized)

    model.fit(normalized, nb_epoch=1, deterministic=True)
    return transformer
    
# Smoke test: fit test_model on the last 10 rows of low_bace_dataset.
train_model(test_model, low_bace_dataset.tail(10))


Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.022 s
TIMING: dataset construction took 0.095 s
Loading dataset from disk.
TIMING: dataset construction took 0.049 s
Loading dataset from disk.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Ending global_step 1: Average loss 0.645826
TIMING: model fitting took 38.598 s
[[ 1.0481931  -0.80038565  0.11046308]
 [ 1.4108983  -0.89190555 -0.3014657 ]
 [ 1.2603654  -1.2325282  -0.25898513]
 [ 1.1178118  -1.1508771  -0.16457354]
 [ 1.4231299  -0.7285227   0.07284688]
 [ 0.9994026  -0.58478886 -0.14232585]
 [ 1.1015353  -0.8567252   0.08157533]
 [ 0.844594   -0.6443539   0.19544438]
 [ 1.5114546  -1.0783786  -0.1482876 ]
 [ 1.0526407  -1.2137191  -0.13992053]]
<class 'numpy.ndarray'>
[[ 0.844594   -0.6443539   0.19544438]
 [ 0.9994026  -0.58478886 -0.14232585]
 [ 1.0481931  -0.80038565  0.11046308]
 [ 1.0526407  -1.2137191  -0.13992053]
 [ 1.1015353  -0.8567252   0.08157533]
 [ 1.1178118  -1.1508771  -0.16457354]
 [ 1.2603654  -1.2325282  -0.25898513]
 [ 1.4108983  -0.89190555 -0.3014657 ]
 [ 1.4231299  -0.7285227   0.07284688]
 [ 1.5114546  -1.0783786  -0.1482876 ]]


In [26]:
###define Abstract Data Type to hold search information, including ensemble

class Experimenter():
    """State of one search run: the model ensemble plus the samples seen so far.

    Relies on the notebook-level names ``low_bace_dataset``, ``dataset_file``
    and ``train_model``.
    """

    def __init__(self, N, M, ensemble_size=100):
        """Build the ensemble and train it on N random rows of ``low_bace_dataset``.

        Args:
            N: how many random samples to initially train on; clamped to the
                number of rows in ``low_bace_dataset``.
            M: the size of batch to purchase.
            ensemble_size: number of models in the ensemble (default 100).
        """
        pool_size = low_bace_dataset.shape[0]
        N = min(N, pool_size)  # cannot sample more rows than the pool holds

        self.N = N  # stored after clamping so it matches what was actually sampled
        self.M = M
        # Map each model to its seed.
        self.ensemble = {
            seed: dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=50, random_seed=seed)
            for seed in range(ensemble_size)
        }
        self.history = []

        ### train the ensemble on a subset that binds bace poorly
        rand_indices = random.sample(range(pool_size), k=N)  # random row positions, no repeats
        # One positional lookup instead of appending row by row (quadratic, and
        # DataFrame.append no longer exists in pandas 2).
        init_ensemble_dataset = low_bace_dataset.iloc[rand_indices].reset_index(drop=True)

        self.samples_seen = init_ensemble_dataset  # examples seen during initial training

        self.train_ensemble(init_ensemble_dataset)  # train ensemble on initialization

    def train_ensemble(self, dataset):
        """Train every model of the ensemble on ``dataset`` via ``train_model``."""
        # Iterate the models themselves; iterating the dict would yield the seeds.
        for model in self.ensemble.values():
            train_model(model, dataset)

    def score_and_select_top(self):
        """Average the ensemble's predictions over the pool and keep the top M rows.

        The pool is featurized from ``dataset_file`` with the task order
        ("bace", "logD", "esol"), so column 0 of the predictions is bace.

        Returns:
            numpy array with at most M rows and 3 columns, sorted by ascending
            column 0, holding the M highest averaged bace predictions. Values
            are in whatever space the models predict in.
        """
        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)
        dataset = loader.featurize(dataset_file)

        predicted = np.zeros((len(dataset), 3))
        for model in self.ensemble.values():
            predicted += model.predict(dataset)
        predicted /= len(self.ensemble)  # take the average of the model predictions

        ### TODO: attach SMILES labels to the scores (e.g. a DataFrame keyed by SMILES)
        ### TODO: compute goodness scores and sort on goodness instead of raw bace
        # Fancy indexing returns a new array, so the sorted result must be kept.
        predicted = predicted[predicted[:, 0].argsort()]  # highest bace at the bottom

        # Explicit start index: a plain [-M:] would return every row when M == 0.
        keep = min(max(self.M, 0), len(predicted))
        return predicted[len(predicted) - keep:, :]

    def run(self):
        """Run the purchase / retrain loop. Not implemented yet.

        Planned loop, not yet designed (stop condition, history recording and
        the samples_seen update are all missing):

            while not <stop condition>:
                self.score_and_select_top()
                # record history
                # update dataset
                self.train_ensemble(self.samples_seen)

        Raises:
            NotImplementedError: always, so callers fail fast instead of
                looping on an undefined stop condition.
        """
        raise NotImplementedError

In [None]:
###define property preference fxns and ranges, cost fxns

# (low, high) bounds per property; an infinite high bound means no upper limit.
bace_range = (4, float("inf"))
esol_range = (-5, float("inf"))
logD_range = (-0.4, 5.6)

### all of these reference ground truth dataframe
def get_bace_score():
    """Placeholder for the bace preference score; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_esol_score():
    """Placeholder for the esol preference score; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_logD_score():
    """Placeholder for the logD preference score; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_goodness_score():
    """Placeholder for the combined goodness score; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()

In [None]:
"""
Step 1: load ground truth models and ensemble

Step 2: train ensemble on N random data points (including ground truth values)

Step 3: score all of the 10K molecules using the ensemble

Step 4: take ("buy") the top M, and "assess them experimentally" (get their ground truth values)

Step 5: add those samples to the training/seen set

Step 6: retrain the ensemble

Step 7: repeat (make 2-6 repeatable)

Step 8: add some loops over N and M to generate plots of Hx vs N,M
"""

In [None]:
# Sweep over initial-sample sizes (n) and purchase-batch sizes (m).
# NOTE(review): N_set and M_set are not defined anywhere in the visible notebook;
# they must be defined (e.g. in a config cell) before this cell can run.
for n in N_set:
    for m in M_set:
        experimenter = Experimenter(n, m)
        # The original assigned to ``E`` but called ``e.run()``, which raised NameError.
        experimenter.run()