In [1]:
###initialize imports and dataset
import math
import numpy as np
import pandas as pd
import deepchem as dc
from deepchem.utils.save import load_from_disk
from deepchem.data import data_loader
import random
random.seed(0)

dataset_file = "./enamineSubset10KGroundTruth.csv"
ground_truth_dataset = load_from_disk(dataset_file) #pandas Dataframe

low_bace_dataset = ground_truth_dataset.copy(deep=True).sort_values(by="bace")[:2500] #take 2500 worst binders as starting set candidates

print("Columns of dataset: %s" % str(ground_truth_dataset.columns.values))
print("Number of examples in dataset: %s" % str(ground_truth_dataset.shape[0]))

Columns of dataset: ['Unnamed: 0' 'SMILES' 'esol' 'logD' 'bace']
Number of examples in dataset: 10000


In [2]:
###initialize ground truth models and methods to access them

def load_oracle_models():
    bace_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/bace")
    esol_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/esol")
    logD_model = dc.models.GraphConvModel(n_tasks=1, mode='regression', batch_size=50, random_seed=0, model_dir="./models/logD")
    bace_model.restore()
    esol_model.restore()
    logD_model.restore()
    oracle = {"bace":bace_model, "esol":esol_model, "logD":logD_model} #get each model via the named property
    return oracle
    
def query_oracle(smiles):
    ### use when evaluating on the fly
    raise NotImplementedError
    

In [28]:
#test_model = dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=50, random_seed=0, model_dir="./models/test_model")

def train_model(model, dataset):
    #take in dataset as pandas Dataframe and convert to dc Dataset via pd to_csv and dc CSVLoader
    dataset_temp_file = "./temp/training_dataset.csv"
    dataset.to_csv(dataset_temp_file)
    
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)
    
    dataset_feat = loader.featurize(dataset_temp_file)

    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
    dataset_feat = transformer.transform(dataset_feat)
    
    model.fit(dataset_feat, nb_epoch=1, deterministic=True, restore=False)
    #results = test_model.predict(dataset_feat)
    
#train_model(test_model, low_bace_dataset[-10:])

In [45]:
###define Abstract Data Type to hold search information, including ensemble

class Experimenter():
    def __init__(self, N, M):
        self.N = N #how many random samples to initially train on
        self.M = M #the size of batch to purchase
        self.ensemble = {i:dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=20, random_seed=i) for i in range(3)} #map each model to its seed
        self.history = [] #save snapshot of model, on disk
        self.samples_seen = None
        self.cost = 0
        self.number_molecules = 0
        self.time = 0 #days
        self.training_epochs=1
        self.ensemble_size = 3
        self.cost_per_molecule = 200
        self.target_bounds = {}
              
        self.bace_range = (4, math.inf)
        self.esol_range = (-5, math.inf)
        self.logD_range = (-0.4, 5.6)
        
    def train_model(self, model, dataset):
        #take in dataset as pandas Dataframe and convert to dc Dataset via pd to_csv and dc CSVLoader
        dataset_temp_file = "./temp/training_dataset.csv"
        dataset.to_csv(dataset_temp_file)

        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)

        dataset_feat = loader.featurize(dataset_temp_file)

        transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
        dataset_feat = transformer.transform(dataset_feat)

        model.fit(dataset_feat, nb_epoch=1, deterministic=True, restore=False)
    
    def train_ensemble(self, dataset):
        for model in self.ensemble.values():
            train_model(model, dataset)

    
    def initial_training(self):
        ###train the ensemble on a subset that binds bace poorly
        if self.N > low_bace_dataset.shape[0]:
            self.N = low_bace_dataset.shape[0]
        rand_indices = random.sample(range(low_bace_dataset.shape[0]), k=self.N) #select random row indices
        
        print("Random initial training indices selected.")

        init_ensemble_dataset = pd.DataFrame()
        for idx in rand_indices:
            init_ensemble_dataset = init_ensemble_dataset.append( low_bace_dataset.iloc[idx], ignore_index=True )
        
        print("Initial training dataset selected.")
        
        self.samples_seen = init_ensemble_dataset ### collect the examples seen during initial training
        self.cost += 200 * len(init_ensemble_dataset)
        self.number_molecules += len(init_ensemble_dataset)
        self.time = 0 ### time to initially train? free initial knowledge?
        
        print("Training ensemble...")
        self.train_ensemble(init_ensemble_dataset) #train ensemble on initialization
        print("Ensemble trained.")
        
        
    ###get_component_score, ket/list(keys),
    def get_bace_score(self, x):
        #higher bace => higher score
        return np.where(x < self.bace_range[0], 0.2*x-0.8, 0.05*x-0.2) #decrease penalty once score > lower end of range

    def get_logD_score(self, x):
        #logD within range is not penalized, 
        x = np.where(x < self.logD_range[0], x - np.absolute(x-self.logD_range[0]), x) #handle lower end of range
        return np.where(x > self.logD_range[1], x - np.absolute(x-self.logD_range[1]), x) #handle upper end of range
   
    def get_esol_score(self, x):
        return np.where(x < self.esol_range[0], x - np.absolute(x-self.logD_range[1])**2, x)
        
    def get_goodness_score(self, bace, logD, esol):
        return get_bace_score(bace) + get_logD_score(logD) + get_esol_score(esol)
    
    
    def score_and_select_top(self):
        predicted = np.zeroes( (len(ground_truth_dataset),3) )
        for model in self.ensemble.values():
            predicted += model.predict(ground_truth_dataset)
        predicted /= len(self.ensemble) #take the average of model predictions
        
        #copy SMILES and assign/calculate scores
        results_df = pd.DataFrame()
        results_df["SMILES"] = ground_truth_dataset["SMILES"]   
        bace = predicted[:,0]
        logD = predicted[:,1]
        esol = predicted[:,2]
        goodness = self.get_goodness_score(bace, logD, esol)
        results_df["bace"] = bace
        results_df["logD"] = logD
        results_df["esol"] = esol
        results_df["goodness"] = goodness
        print(results_df)
        
        seen_smiles = self.samples_seen["SMILES"].tolist()
        
        unseen_rows = results_df.loc[~df['SMILES'].isin(seen_smiles)] #remove examples previously seen
        unseen_rows.sort_values(by="goodness", ascending=False) #sort with highest goodness at top
        
        if len(unseen_rows) > self.M:
            subset = unseen_rows[:self.M]
        else:
            subset = unseen_rows
        
        self.samples_seen.append(subset) # ignore_index=True?
        self.cost += 200 * len(subset)
        self.number_molecules += len(subset)
            
        self.time += 28 #4 weeks to buy and experiment
        
        
    def run(self):
        while len(self.seen_samples) < len(ground_truth_dataset): #replace with top bace score to exit
            self.score_and_select_top()
            #record history
            self.train_ensemble(self.samples_seen)


In [None]:
"""
Step 1: load ground truth models and ensemble

Step 2: train ensemble on N random data points (including ground truth values)

Step 3: score all of the 10K molecules using the ensemble

Step 4: take ("buy") the top M, and "assess them experimentally" (get their ground truth values)

Step 5: add those samples to the training/seen set

Step 6: retrain the ensemble

Step 7: repeat (make 2-6 repeatable)

Step 8: add some loops over N and M to generate plots of Hx vs N,M
"""

In [46]:
N = [2000] #initial train set size
M = [5000] #batch size -> 96 wells, multiples
for n in N:
    for m in M:
        E = Experimenter(n, m)
        E.initial_training()
        #e.run()

Random initial training indices selected.
Initial training dataset selected.
Training ensemble...
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 3.280 s
TIMING: dataset construction took 4.054 s
Loading dataset from disk.
TIMING: dataset construction took 1.290 s
Loading dataset from disk.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Ending global_step 100: Average loss 0.729629
TIMING: model fitting took 426.686 s
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 7.687 s
TIMING: dataset construction took 10.141 s
Loading dataset from disk.
TIMING: dataset construction took 3.146 s
Loading dataset from disk.
Ending global_step 100: Average loss 0.710652
TIMING: model fitting took 476.719 s
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./temp/training_dataset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 8.983 s
TIMING: dataset construction took 11.217 s
Loading dataset from disk.
TIMING: dataset construction took 4.688 s
Loading dataset from disk.
Ending global_step 100: Average loss 0.674514
TIMING: model fitting took 559.385 s
Ensemble trained.
