In [16]:
###initialize imports and dataset
import math
import numpy as np
import pandas as pd
import deepchem as dc
from deepchem.utils.save import load_from_disk
from deepchem.data import data_loader
import random
random.seed(0)

dataset_file = "./enamineSubset10KGroundTruth.csv"
ground_truth_dataset = load_from_disk(dataset_file) #pandas Dataframe

low_bace_dataset = ground_truth_dataset.copy(deep=True).sort_values(by="bace")[:2500] #take 2.5K worst binder potential starters

print("Columns of dataset: %s" % str(ground_truth_dataset.columns.values))
print("Number of examples in dataset: %s" % str(ground_truth_dataset.shape[0]))

Columns of dataset: ['Unnamed: 0' 'SMILES' 'esol' 'logD' 'bace']
Number of examples in dataset: 10000


In [17]:
###initialize ground truth models and methods to access them

def load_oracle_models():
    """Loads the pretrained ground truth models for evaluating molecules' properties on-the-fly.
    
    Returns
    -------
    oracle : dict
        A dictionary containing models mapped to their property keywords \"bace\", \"esol\", \"logD\".
    """
    bace_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/bace")
    esol_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/esol")
    logD_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/logD")
    bace_model.restore()
    esol_model.restore()
    logD_model.restore()
    oracle = {"bace":bace_model, "esol":esol_model, "logD":logD_model} #get each model via the named property
    return oracle

def query_oracle(dataset, oracle):
    """Evaluate molecules on-the-fly for their estimated bace, esol, and logD scores.
    
    Parameters
    ----------
    dataset : pandas.DataFrame
        The input dataset; must includes a field with smiles strings under keyword "SMILES".
    oracle : dictionary( dc.models.GraphConvModel )
        The pretrained ground truth value prediction models.
        
    Returns
    -------
    results : pandas.DataFrame
        Copy of input dataset with newly estimated bace, esol, and logD scores under those headers. 
    """
    query_file = "./temp/oracle_eval.csv"
    dataset.to_csv(query_file)
    
    results = dataset.copy(deep=True) #defensive copy of input dataframe 
    
    featurizer = dc.feat.ConvMolFeaturizer()
    for prop in ("bace", "esol", "logD"):
        #retrieve appropriate model from oracle
        model = oracle[prop]
        
        #load, featurize, and normalize input dataset
        loader = dc.data.CSVLoader(tasks=[prop], smiles_field="SMILES",featurizer=featurizer)
        dataset_feat = loader.featurize(query_file)
        transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
        dataset_feat = transformer.transform(dataset_feat)
        
        #predict and assign property results to keyword
        predicted = model.predict(dataset_feat)
        results[prop] = predicted
        
    return results

In [18]:
###define Abstract Data Type to hold search information, including ensemble

class Experimenter():
    """Class representing a research scientist/team going through the drug development process.
    
    Parameters
    ----------
    N : int
        Number of samples to initially train the experimenter ensemble on.
    M : int
        Number of molecules to purchase in each batch.
    ensemble_size : int
        Number of models in experimenter ensemble.
    epochs : int
        Number of epochs to train ensemble models for at each stage.
    molecule_cost : int or float
        Monetary cost of purchasing a single molecule.
    target_bounds : dictionary of str:tuples(floats)
        Desired range for each property.
    
    Attributes
    ----------
    ensemble : dictionary of deepchem.models.GrachConvModel
        Models representing the experimenter knowledge/predictions and uncertainty.
    history : list of <NEEDS IMPLEMENTING>
    samples_seen : pandas.DataFrame
        All of the molecules seen before. Includes initial training set.
    cost : int or float
        Total monetary cost incurred at the current time.
    number_molecules : int
        Total number of molecules purchased at the current time.
    time : int
        Total number of days spent up to the current time.
    """
    def __init__(self, N, M, ensemble_size=3, epochs=1, molecule_cost=200,
                 target_bounds={"bace":(4, math.inf), "esol":(-5, math.inf), "logD":(-0.4, 5.6)} ):
        self.N = N #how many random samples to initially train on
        self.M = M #the size of batch to purchase
        self.ensemble_size = ensemble_size
        self.epochs = epochs
        self.molecule_cost = molecule_cost
        self.target_bounds = target_bounds
        
        self.ensemble = {i:dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=20, random_seed=i) for i in range(3)} #map each model to its seed
        self.history = [] #save snapshot of model, on disk
        self.samples_seen = None
        self.cost = 0
        self.number_molecules = 0
        self.time = 0 #days
        
        
    def train_model(self, model, dataset):
        #take in dataset as pandas Dataframe and convert to dc Dataset via pd to_csv and dc CSVLoader
        dataset_temp_file = "./temp/training_dataset.csv"
        dataset.to_csv(dataset_temp_file)

        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)

        dataset_feat = loader.featurize(dataset_temp_file)

        transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
        dataset_feat = transformer.transform(dataset_feat)

        model.fit(dataset_feat, nb_epoch=1, deterministic=True, restore=False)
    
    def train_ensemble(self, dataset):
        for model in self.ensemble.values():
            train_model(model, dataset)

    
    def initial_training(self):
        ###train the ensemble on a subset that binds bace poorly
        if self.N > low_bace_dataset.shape[0]:
            self.N = low_bace_dataset.shape[0]
        rand_indices = random.sample(range(low_bace_dataset.shape[0]), k=self.N) #select random row indices
        
        print("Random initial training indices selected.")

        init_ensemble_dataset = pd.DataFrame()
        for idx in rand_indices:
            init_ensemble_dataset = init_ensemble_dataset.append( low_bace_dataset.iloc[idx], ignore_index=True )
        
        print("Initial training dataset selected.")
        
        self.samples_seen = init_ensemble_dataset ### collect the examples seen during initial training
        self.cost += 200 * len(init_ensemble_dataset)
        self.number_molecules += len(init_ensemble_dataset)
        self.time = 0 ### time to initially train? free initial knowledge?
        
        print("Training ensemble...")
        self.train_ensemble(init_ensemble_dataset) #train ensemble on initialization
        print("Ensemble trained.")
        
        
    ###get_component_score, ket/list(keys),
    def get_bace_score(self, x):
        #higher bace => higher score
        return np.where(x < self.bace_range[0], 0.2*x-0.8, 0.05*x-0.2) #decrease penalty once score > lower end of range

    def get_logD_score(self, x):
        #logD within range is not penalized, 
        x = np.where(x < self.logD_range[0], x - np.absolute(x-self.logD_range[0]), x) #handle lower end of range
        return np.where(x > self.logD_range[1], x - np.absolute(x-self.logD_range[1]), x) #handle upper end of range
   
    def get_esol_score(self, x):
        return np.where(x < self.esol_range[0], x - np.absolute(x-self.logD_range[1])**2, x)
        
    def get_goodness_score(self, bace, logD, esol):
        return self.get_bace_score(bace) + self.get_logD_score(logD) + self.get_esol_score(esol)
    
    
    def score_and_select_top(self):

        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)
        dataset_feat = loader.featurize(dataset_file)
        transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
        dataset_feat = transformer.transform(dataset_feat)
        
        predicted = np.zeros( (len(dataset_feat),3) )
        for model in self.ensemble.values():
            predicted += model.predict(dataset_feat)
        predicted /= len(self.ensemble) #take the average of model predictions
        
        #copy SMILES and assign/calculate scores
        results_df = pd.DataFrame()
        results_df["SMILES"] = ground_truth_dataset["SMILES"]   
        bace = predicted[:,0]
        logD = predicted[:,1]
        esol = predicted[:,2]
        goodness = self.get_goodness_score(bace, logD, esol)
        results_df["bace"] = bace
        results_df["logD"] = logD
        results_df["esol"] = esol
        results_df["goodness"] = goodness
        
        seen_smiles = self.samples_seen["SMILES"].tolist()
        
        unseen_rows = results_df.loc[~results_df['SMILES'].isin(seen_smiles)] #remove examples previously seen
        unseen_rows.sort_values(by="goodness", ascending=False) #sort with highest goodness at top
        
        if len(unseen_rows) > self.M:
            subset = unseen_rows[:self.M]
        else:
            subset = unseen_rows
        
        self.samples_seen.append(subset, sort=False) # ignore_index=True?
        self.cost += 200 * len(subset)
        self.number_molecules += len(subset)
            
        self.time += 28 #4 weeks to buy and experiment
        
        
    def run(self):
        while len(self.samples_seen) < len(ground_truth_dataset): #replace with top bace score to exit
            self.score_and_select_top()
            #record history
            self.train_ensemble(self.samples_seen)
            break


In [19]:
"""
Step 1: load ground truth models and ensemble

Step 2: train ensemble on N random data points (including ground truth values)

Step 3: score all of the 10K molecules using the ensemble

Step 4: take ("buy") the top M, and "assess them experimentally" (get their ground truth values)

Step 5: add those samples to the training/seen set

Step 6: retrain the ensemble

Step 7: repeat (make 2-6 repeatable)

Step 8: add some loops over N and M to generate plots of Hx vs N,M
"""

'\nStep 1: load ground truth models and ensemble\n\nStep 2: train ensemble on N random data points (including ground truth values)\n\nStep 3: score all of the 10K molecules using the ensemble\n\nStep 4: take ("buy") the top M, and "assess them experimentally" (get their ground truth values)\n\nStep 5: add those samples to the training/seen set\n\nStep 6: retrain the ensemble\n\nStep 7: repeat (make 2-6 repeatable)\n\nStep 8: add some loops over N and M to generate plots of Hx vs N,M\n'

In [20]:
N = [10] #initial train set size
M = [10] #batch size -> 96 wells, multiples
for n in N:
    for m in M:
        E = Experimenter(n, m)

In [21]:
E.target_bounds

{'bace': (4, inf), 'esol': (-5, inf), 'logD': (-0.4, 5.6)}

In [22]:
E.initial_training()

Random initial training indices selected.
Initial training dataset selected.
Training ensemble...
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.086 s
TIMING: dataset construction took 0.145 s
Loading dataset from disk.
TIMING: dataset construction took 0.043 s
Loading dataset from disk.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Ending global_step 1: Average loss 0.761683
TIMING: model fitting took 12.007 s
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.020 s
TIMING: dataset construction took 0.075 s
Loading dataset from disk.
TIMING: dataset construction took 0.025 s
Loading dataset from disk.
Ending global_step 1: Average loss 0.771387
TIMING: model fitting took 13.483 s
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.023 s
TIMING: dataset construction took 0.086 s
Loading dataset from disk.
TIMING: dataset construction took 0.036 s
Loading dataset from disk.
Ending global_step 1: Average loss 0.728813
TIMING: model fitting took 14.577 s
Ensemble trained.


In [23]:
E.run()

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./enamineSubset10KGroundTruth.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 16.421 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 1 took 3.436 s
TIMING: dataset construction took 25.362 s
Loading dataset from disk.
TIMING: dataset construction took 6.795 s
Loading dataset from disk.


AttributeError: 'Experimenter' object has no attribute 'bace_range'