In [1]:
###initialize imports and dataset
import math
import numpy as np
import pandas as pd
import deepchem as dc
from deepchem.utils.save import load_from_disk
from deepchem.data import data_loader
import random
# Seed the stdlib RNG so later random draws in the notebook are repeatable.
random.seed(0)

# Load the ground-truth table; load_from_disk yields a pandas DataFrame here.
dataset_file = "./enamineSubset10KGroundTruth.csv"
ground_truth_dataset = load_from_disk(dataset_file)

# Starting-set candidates: the 2500 rows with the smallest "bace" values
# (the worst binders), taken from an independent deep copy of the table.
low_bace_dataset = (
    ground_truth_dataset
    .copy(deep=True)
    .sort_values(by="bace")
    .iloc[:2500]
)

print("Columns of dataset: %s" % str(ground_truth_dataset.columns.values))
print("Number of examples in dataset: %s" % str(len(ground_truth_dataset)))

Columns of dataset: ['Unnamed: 0' 'SMILES' 'esol' 'logD' 'bace']
Number of examples in dataset: 10000


In [None]:
###initialize ground truth models and methods to access them

def load_oracle_models():
    """Build and restore the three single-task ground-truth ("oracle") models.

    One regression GraphConvModel is created per property, each pointing at
    its own directory under ./models, and restore() is then called on each.

    Returns:
        dict: maps "bace", "esol" and "logD" to the corresponding model.
    """
    property_names = ("bace", "esol", "logD")
    oracle = {
        name: dc.models.GraphConvModel(
            n_tasks=1,
            mode='regression',
            batch_size=50,
            random_seed=0,
            model_dir="./models/" + name,
        )
        for name in property_names
    }
    # Restore only once all three models exist, in the order they were built.
    for name in property_names:
        oracle[name].restore()
    return oracle
    
def query_oracle(smiles):
    """Placeholder for evaluating molecules on the fly; not implemented yet.

    Args:
        smiles: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    

In [15]:
#test_model = dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=50, random_seed=0, model_dir="./models/test_model")

from deepchem.data.data_loader import featurize_smiles_df

def train_model(model, dataset):
    """Fit ``model`` for one epoch on a pandas DataFrame of labelled molecules.

    The frame is written to a scratch CSV, featurized with ConvMolFeaturizer
    through CSVLoader, y-normalized, and handed to ``model.fit``.

    Args:
        model: object exposing ``fit(dataset, nb_epoch=..., deterministic=...)``.
        dataset: pandas DataFrame with "SMILES", "bace", "logD" and "esol" columns.

    Returns:
        None.
    """
    import os  # local import: only needed to prepare the scratch directory

    #take in dataset as pandas Dataframe and convert to dc Dataset via pd to_csv and dc CSVLoader
    dataset_temp_file = "./temp/training_dataset.csv"
    # to_csv does not create missing parent directories, so make sure ./temp exists first
    os.makedirs(os.path.dirname(dataset_temp_file), exist_ok=True)
    dataset.to_csv(dataset_temp_file)

    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)

    dataset_feat = loader.featurize(dataset_temp_file)

    # NOTE(review): the fitted transformer is not returned, so callers cannot undo
    # the y-normalization on predictions - confirm whether that is intended.
    transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset_feat)
    dataset_feat = transformer.transform(dataset_feat)

    model.fit(dataset_feat, nb_epoch=1, deterministic=True)
    
#train_model(test_model, low_bace_dataset[-10:])

In [26]:
###define Abstract Data Type to hold search information, including ensemble

class Experimenter():
    """Abstract data type holding the state of one search, including the ensemble.

    Relies on module-level names defined in other cells: ``dc``, ``random``,
    ``low_bace_dataset``, ``dataset_file`` and ``train_model``.

    Attributes:
        N: requested number of random samples for the initial training set.
        M: size of the batch to purchase per round.
        ensemble: dict mapping each seed to the GraphConvModel built with it.
        history: list reserved for per-round records (nothing appends to it yet).
        samples_seen: DataFrame of the examples used for training so far.
        bace_range, esol_range, logD_range: (low, high) preferred ranges.
    """

    def __init__(self, N, M, ensemble_size=100):
        """Build the ensemble and train it on N random rows of low_bace_dataset.

        Args:
            N: how many random samples to initially train on; clamped to the
                number of rows available in ``low_bace_dataset``.
            M: the size of batch to purchase.
            ensemble_size: number of models in the ensemble (seeds 0..size-1).
        """
        self.N = N #how many random samples to initially train on
        self.M = M #the size of batch to purchase
        #map each model to its seed
        self.ensemble = {
            seed: dc.models.GraphConvModel(n_tasks=3, mode='regression', batch_size=50, random_seed=seed)
            for seed in range(ensemble_size)
        }
        self.history = []

        self.bace_range = (4, math.inf)
        self.esol_range = (-5, math.inf)
        self.logD_range = (-0.4, 5.6)

        ###train the ensemble on a subset that binds bace poorly
        # The rows are drawn from low_bace_dataset, so that frame bounds the sample size
        # (the original compared against an undefined name `dataset`).
        pool_size = low_bace_dataset.shape[0]
        n_initial = min(N, pool_size)
        rand_indices = random.sample(range(pool_size), k=n_initial) #select random row indices

        # Single positional selection instead of appending one row at a time:
        # DataFrame.append was quadratic here and no longer exists in pandas 2.
        init_ensemble_dataset = low_bace_dataset.iloc[rand_indices].reset_index(drop=True)

        self.samples_seen = init_ensemble_dataset ### collect the examples seen during initial training

        self.train_ensemble(init_ensemble_dataset) #train ensemble on initialization

    def get_bace_score(self, x):
        """Score bace values; higher bace => higher score.

        Below the lower end of ``bace_range`` the slope is 0.2, at or above it
        the slope drops to 0.05; both pieces are zero at x == 4.
        """
        return np.where(x < self.bace_range[0], 0.2*x-0.8, 0.05*x-0.2) #decrease penalty once score > lower end of range

    def get_logD_score(self, x):
        """Score logD values; values outside ``logD_range`` are reduced by their distance to the violated bound."""
        x = np.where(x < self.logD_range[0], x - np.absolute(x-self.logD_range[0]), x) #handle lower end of range
        return np.where(x > self.logD_range[1], x - np.absolute(x-self.logD_range[1]), x) #handle upper end of range

    def get_esol_score(self, x):
        """Score esol values; values below the lower end of ``esol_range`` lose the squared distance to it."""
        # Measured against the esol bound; the original used logD_range[1] here (copy-paste slip).
        return np.where(x < self.esol_range[0], x - np.absolute(x-self.esol_range[0])**2, x)

    def get_goodness_score(self, bace, logD, esol):
        """Return the sum of the three per-property scores."""
        return self.get_bace_score(bace) + self.get_logD_score(logD) + self.get_esol_score(esol)

    def train_ensemble(self, dataset):
        """Train every model of the ensemble on ``dataset`` (a pandas DataFrame)."""
        # Iterate the models, not the dict's seed keys.
        for model in self.ensemble.values():
            train_model(model, dataset)

    def score_and_select_top(self):
        """Score every molecule in the ground-truth file and keep the best M.

        Returns:
            pandas DataFrame with columns "SMILES", "bace", "logD", "esol" and
            "goodness", holding the M rows with the highest goodness in
            ascending order.

        Raises:
            ValueError: if the ensemble is empty.
        """
        if not self.ensemble:
            raise ValueError("cannot score molecules with an empty ensemble")

        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=["bace", "logD", "esol"], smiles_field="SMILES", featurizer=featurizer)
        # Models predict on featurized datasets, so featurize the ground-truth CSV itself.
        candidates = loader.featurize(dataset_file)

        predicted = np.zeros((len(candidates), 3))
        for model in self.ensemble.values():
            predicted += np.reshape(model.predict(candidates), predicted.shape)

        predicted /= len(self.ensemble) #take the average of the model predictions

        # NOTE(review): train_model fits on y-normalized targets and no transformer is
        # passed to predict, so these values are presumably in normalized units while
        # the preferred ranges are in raw units - confirm.
        # Column order follows the loader's task list.
        bace = predicted[:,0]
        logD = predicted[:,1]
        esol = predicted[:,2]

        results_df = pd.DataFrame()
        # presumably ids are the SMILES strings (CSVLoader without an id field) - verify;
        # they stay aligned with the predictions even if featurization drops rows
        results_df["SMILES"] = candidates.ids
        results_df["bace"] = bace
        results_df["logD"] = logD
        results_df["esol"] = esol
        results_df["goodness"] = self.get_goodness_score(bace, logD, esol)

        # sort_values returns a new frame; tail(M) also gives an empty frame for M == 0,
        # whereas [-0:] would return every row.
        return results_df.sort_values(by="goodness").tail(self.M) #molecules with M highest goodness scores

    def run(self):
        """Run the purchase-and-retrain loop; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        # Planned loop, pending a stop condition:
        #   score_and_select_top()
        #   record history
        #   update dataset
        #   train_ensemble(self.samples_seen)
        raise NotImplementedError


In [None]:
###define property preference fxns and ranges, cost fxns

def get_bace_score():
    """Module-level placeholder for the bace preference function.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_esol_score():
    """Module-level placeholder for the esol preference function.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_logD_score():
    """Module-level placeholder for the logD preference function.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
    
def get_goodness_score():
    """Module-level placeholder for the combined goodness function.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()

In [None]:
"""
Step 1: load ground truth models and ensemble

Step 2: train ensemble on N random data points (including ground truth values)

Step 3: score all of the 10K molecules using the ensemble

Step 4: take ("buy") the top M, and "assess them experimentally" (get their ground truth values)

Step 5: add those samples to the training/seen set

Step 6: retrain the ensemble

Step 7: repeat (make 2-6 repeatable)

Step 8: add some loops over N and M to generate plots of Hx vs N,M
"""

In [None]:
# Sweep the initial-sample size (n) and the purchase-batch size (m).
# NOTE(review): N_set and M_set are not defined in the visible cells - define them before running this cell.
for n in N_set:
    for m in M_set:
        E = Experimenter(n, m)
        E.run() # the instance is bound to `E`; the original called `e.run()` on an unbound name