IMPORT NECESSARY MODULES

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../..')

import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import tensorflow as tf
import gpflow
from src import models
from src import dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from src.helpers import count_cf_bonds, create_morgan_space
from src.graphnn import mol2graph
import pickle

2024-10-14 22:57:50.863647: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-14 22:57:50.869918: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-14 22:57:50.874395: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-14 22:57:50.886274: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-14 22:57:50.905408: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

PREPROCESSING DATA

In [7]:

# dir = os.getcwd()
# dir
def generate_dataset(splitter, name):

    ldtoxdb = pd.read_csv('../../data/ldtoxdb-mordred.csv').dropna(axis=1)
    ldtoxdb['rd_mol'] = ldtoxdb.SMI.apply(Chem.MolFromSmiles)
    ldtoxdb=ldtoxdb.dropna(subset=['rd_mol'])
    ldtoxdb['n_cf_bonds'] = ldtoxdb.rd_mol.apply(count_cf_bonds)
    ldtoxdb['mol_wt'] = ldtoxdb.rd_mol.apply(Chem.Descriptors.MolWt)
    ldtoxdb['is_pfas_like'] = ldtoxdb['n_cf_bonds'] >= 2
 
    pfas8k = pd.read_csv('../../data/pfas8k-mordred.csv')
    pfas8k['mol']=pfas8k.SMILES.apply(Chem.MolFromSmiles)
    pfas8k = pfas8k.dropna(subset=['mol'])
    pfas8k['canon_smi'] = pfas8k['mol'].apply(Chem.MolToSmiles)
    pfas8k = pfas8k.drop('mol',axis=1)
    ldtoxdb['is_pfas'] = ldtoxdb.SMI.isin(pfas8k.canon_smi)

    mordred = ldtoxdb.columns[5:-5]
    
    # comment next 4 lines for speed if processing data only for non-benchmarks
    ecfp4096 = np.array(ldtoxdb.rd_mol.apply(create_morgan_space(nbits=4096, r=2)).tolist())
    ecfp2048 = np.array(ldtoxdb.rd_mol.apply(create_morgan_space(nbits=2048, r=1)).tolist())
    ecfp2048r6 = np.array(ldtoxdb.rd_mol.apply(create_morgan_space(nbits=2048, r=6)).tolist())
    # graph = np.array(ldtoxdb.rd_mol.apply(mol2graph.mol2torchdata).tolist())
    graph = np.array(ldtoxdb.rd_mol.apply(mol2graph.mol2torchdata).tolist(),dtype=object)


    # for stratified splitting
    bins = pd.cut(ldtoxdb[['NeglogLD50']].to_numpy().reshape(-1), bins=5, labels=False)

    for foldno, (train_idx, test_idx) in enumerate(splitter.split(ldtoxdb, bins)):
        prefix = '../../data/preprocessed/%s/fold%d' % (name, foldno)

        train = ldtoxdb.iloc[train_idx]
        test = ldtoxdb.iloc[test_idx]

        pfas_like_pfas_excluded_train = train.loc[(train.is_pfas_like & ~train.is_pfas)]
        pfas_like_pfas_excluded_test = test.loc[(test.is_pfas_like & ~test.is_pfas)]
        pfas_like_train = train.loc[train.is_pfas_like]
        pfas_like_test = test.loc[test.is_pfas_like]
        pfas_like_excluded_train = train.loc[~train.is_pfas_like]
        pfas_like_excluded_test = test.loc[~test.is_pfas_like]
        pfas_excluded_train = train.loc[~train.is_pfas]
        pfas_excluded_test = test.loc[~test.is_pfas]
        pfas_train = train.loc[train.is_pfas]
        pfas_test = test.loc[test.is_pfas]

        pfas_like_train_idx = pfas_like_train.index
        pfas_like_test_idx = pfas_like_test.index

        # SMILES
        np.savez_compressed(prefix + '_smiles_test', smiles=test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train', smiles=train[['SMI']].to_numpy())

        np.savez_compressed(prefix + '_smiles_test_pfas_like', smiles=pfas_like_test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train_pfas_like', smiles=pfas_like_train[['SMI']].to_numpy())

        np.savez_compressed(prefix + '_smiles_test_pfas_like_excluded', smiles=pfas_like_excluded_test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train_pfas_like_excluded', smiles=pfas_like_excluded_train[['SMI']].to_numpy())

        np.savez_compressed(prefix + '_smiles_test_pfas_like_pfas_excluded', smiles=pfas_like_pfas_excluded_test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train_pfas_like_pfas_excluded', smiles=pfas_like_pfas_excluded_train[['SMI']].to_numpy())

        np.savez_compressed(prefix + '_smiles_test_pfas_excluded', smiles=pfas_excluded_test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train_pfas_excluded', smiles=pfas_excluded_train[['SMI']].to_numpy())

        np.savez_compressed(prefix + '_smiles_test_pfas', smiles=pfas_test[['SMI']].to_numpy())
        np.savez_compressed(prefix + '_smiles_train_pfas', smiles=pfas_train[['SMI']].to_numpy())

        # Outputs
        np.savez_compressed(prefix + '_y_test', y=test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train', y=train[['NeglogLD50']].to_numpy())

        np.savez_compressed(prefix + '_y_test_pfas_like', y=pfas_like_test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train_pfas_like', y=pfas_like_train[['NeglogLD50']].to_numpy())

        np.savez_compressed(prefix + '_y_test_pfas_like_excluded', y=pfas_like_excluded_test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train_pfas_like_excluded', y=pfas_like_excluded_train[['NeglogLD50']].to_numpy())

        np.savez_compressed(prefix + '_y_test_pfas_like_pfas_excluded', y=pfas_like_pfas_excluded_test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train_pfas_like_pfas_excluded', y=pfas_like_pfas_excluded_train[['NeglogLD50']].to_numpy())

        np.savez_compressed(prefix + '_y_test_pfas_excluded', y=pfas_excluded_test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train_pfas_excluded', y=pfas_excluded_train[['NeglogLD50']].to_numpy())

        np.savez_compressed(prefix + '_y_test_pfas', y=pfas_test[['NeglogLD50']].to_numpy())
        np.savez_compressed(prefix + '_y_train_pfas', y=pfas_train[['NeglogLD50']].to_numpy())

        # Mordred inputs
        col_selector = VarianceThreshold()
        np.savez_compressed(prefix + '_mordred_x_train', x=col_selector.fit_transform(train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test', x=col_selector.transform(test[mordred]).astype(np.float32))

        np.savez_compressed(prefix + '_mordred_x_train_pfas_like', x=col_selector.transform(pfas_like_train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test_pfas_like', x=col_selector.transform(pfas_like_test[mordred]).astype(np.float32))

        np.savez_compressed(prefix + '_mordred_x_train_pfas_like_excluded', x=col_selector.transform(pfas_like_excluded_train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test_pfas_like_excluded', x=col_selector.transform(pfas_like_excluded_test[mordred]).astype(np.float32))

        np.savez_compressed(prefix + '_mordred_x_train_pfas_like_pfas_excluded', x=col_selector.transform(pfas_like_pfas_excluded_train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test_pfas_like_pfas_excluded', x=col_selector.transform(pfas_like_pfas_excluded_test[mordred]).astype(np.float32))

        np.savez_compressed(prefix + '_mordred_x_train_pfas_excluded', x=col_selector.transform(pfas_excluded_train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test_pfas_excluded', x=col_selector.transform(pfas_excluded_test[mordred]).astype(np.float32))

        np.savez_compressed(prefix + '_mordred_x_train_pfas', x=col_selector.transform(pfas_train[mordred]).astype(np.float32))
        np.savez_compressed(prefix + '_mordred_x_test_pfas', x=col_selector.transform(pfas_test[mordred]).astype(np.float32))

        # We need these for inference later on
        indices = col_selector.get_support(indices=True)
        np.savez_compressed(prefix + '_mordred_x_cols', cols=train[mordred].iloc[[0], indices].columns)
        # continue

        # ECFP-4096 inputs
        np.savez_compressed(prefix + '_ecfp_4096_x_train', x=col_selector.fit_transform(ecfp4096[train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_4096_x_test', x=col_selector.transform(ecfp4096[test_idx]).astype(np.float32))

        np.savez_compressed(prefix + '_ecfp_4096_x_train_pfas_like', x=col_selector.transform(ecfp4096[pfas_like_train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_4096_x_test_pfas_like', x=col_selector.transform(ecfp4096[pfas_like_test_idx]).astype(np.float32))

        # ECFP-2048 inputs
        np.savez_compressed(prefix + '_ecfp_2048_x_train', x=col_selector.fit_transform(ecfp2048[train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_2048_x_test', x=col_selector.transform(ecfp2048[test_idx]).astype(np.float32))

        np.savez_compressed(prefix + '_ecfp_2048_x_train_pfas_like', x=col_selector.transform(ecfp2048[pfas_like_train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_2048_x_test_pfas_like', x=col_selector.transform(ecfp2048[pfas_like_test_idx]).astype(np.float32))

        # ECFP-2048 inputs
        np.savez_compressed(prefix + '_ecfp_2048r6_x_train', x=col_selector.fit_transform(ecfp2048r6[train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_2048r6_x_test', x=col_selector.transform(ecfp2048r6[test_idx]).astype(np.float32))

        np.savez_compressed(prefix + '_ecfp_2048r6_x_train_pfas_like', x=col_selector.transform(ecfp2048r6[pfas_like_train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_ecfp_2048r6_x_test_pfas_like', x=col_selector.transform(ecfp2048r6[pfas_like_test_idx]).astype(np.float32))

        # GP Convienience
        col_selector2 = VarianceThreshold()
        np.savez_compressed(prefix + '_gp_x_train', x=col_selector.fit_transform(train[mordred]).astype(np.float32),
                                                            x2=col_selector2.fit_transform(ecfp4096[train_idx]).astype(np.float32))
        np.savez_compressed(prefix + '_gp_x_test', x=col_selector.transform(test[mordred]).astype(np.float32),
                                                           x2=col_selector2.transform(ecfp4096[test_idx]).astype(np.float32))
        # GCN
        # Graph featurized on fly

def main():
    np.random.seed(9700)
    generate_dataset(splitter=KFold(n_splits=5, shuffle=True), name='random')

    np.random.seed(9700)
    generate_dataset(splitter=StratifiedKFold(n_splits=5, shuffle=True), name='stratified')

if __name__ == '__main__':
    main()

[23:01:12] Explicit valence for atom # 1 Si, 4, is greater than permitted
[23:01:14] Explicit valence for atom # 1 Cl, 3, is greater than permitted
  data = Data(x=torch.tensor(node_f, dtype=torch.float),
[23:03:24] Explicit valence for atom # 1 Si, 4, is greater than permitted
[23:03:25] Explicit valence for atom # 1 Cl, 3, is greater than permitted


LOAD DATASET

In [8]:
def load_dataset(encoding, fold, stratified, subset, path_prefix=''):

    base_path = os.getcwd() + path_prefix + '../data/preprocessed/{type}/fold{no}_'.format(
        type = 'random' if not stratified else 'stratified',
        no = fold
    )

    x_paths = base_path + '{encoding}' + '_{set}{subset}.npz'
    y_paths = base_path + '{set}{subset}.npz'

    subset = ('_' + subset) if subset is not None else ''

    y_train = y_paths.format(set = 'y_train', subset = subset)
    smiles_train = y_paths.format(set = 'smiles_train', subset = subset)

    x_train = x_paths.format(encoding=encoding, set = 'x_train', subset = subset)

    if encoding == 'smiles':
        x_train = smiles_train

    x_train_load = np.load(x_train, allow_pickle=True)
    x_train_load = [x_train_load[f] for f in x_train_load.files]

    if len(x_train_load) == 1:
        x_train_load = x_train_load[0]


    y_test = y_paths.format(set = 'y_test', subset = subset)
    smiles_test = y_paths.format(set = 'smiles_test', subset = subset)

    x_test = x_paths.format(encoding=encoding, set = 'x_test', subset = subset)
    
    if encoding == 'smiles':
        x_test = smiles_test
        
    x_test_load = np.load(x_test, allow_pickle=True)
    x_test_load = [x_test_load[f] for f in x_test_load.files]

    if len(x_test_load) == 1:
        x_test_load = x_test_load[0]

    train = (x_train_load, np.load(y_train, allow_pickle=True)['y'], np.load(smiles_train, allow_pickle=True)['smiles'])
    test = (x_test_load, np.load(y_test, allow_pickle=True)['y'], np.load(smiles_test, allow_pickle=True)['smiles'])

    return (train, test)

EXPERIMENTAL SETUP


In [9]:

scaler = StandardScaler()
class LD50UnitConverter():
    def convert_to_mgkg(self, neglogld50s, smiles):

        for neglogld50, smile in zip(neglogld50s, smiles):
            molwt = Descriptors.MolWt(Chem.MolFromSmiles(smile[0]))
            yield (10**(-1*neglogld50[0]))*1000*molwt


    def convert_to_epa(self, neglogld50s, smiles):
        mgkg = list(self.convert_to_mgkg(neglogld50s=neglogld50s, smiles=smiles))

        return pd.cut(mgkg, labels=(0,1,2,3), bins=(-np.inf,50,500,5000, np.inf))


    def convert_to_epa(self, neglogld50s, smiles):
        mgkg = list(self.convert_to_mgkg(neglogld50s=neglogld50s, smiles=smiles))
        mgkg = [x for x in mgkg if x is not None]  # Remove None values
        return pd.cut(mgkg, labels=(0,1,2,3), bins=(-np.inf,50,500,5000, np.inf))


class CrossValidator():
    def __init__(self, splits = 5, sampling_type = 'random'):
        self.sampling_stratified = sampling_type == 'stratified'
        self.splits = splits

    def get_folds(self, encoding, subset = None):
       for fold in range(self.splits):
            yield dataset.load_dataset(encoding, fold, stratified=self.sampling_stratified, subset=subset,
                path_prefix = '/../')

MODEL SETUP

In [17]:
class RF:
    seed = 9700
    #gritta changed : reset n_estimators and max_depth because of memory issues
    # n_estimators = 4096
    # max_depth = 32
    n_estimators = 1000
    max_depth = 20
    min_samples_split = 2
    min_samples_leaf = 1

    def fit(self, x_train, y_train):
        np.random.seed(self.seed)

        self.estimator = RandomForestRegressor(n_estimators = self.n_estimators,
                                                max_depth = self.max_depth,
                                                min_samples_split = self.min_samples_split,
                                                min_samples_leaf = self.min_samples_leaf, 
                                                n_jobs=-1)

        self.estimator.fit(x_train, y_train.ravel())

        return self

    def predict(self, x):
        if self.estimator is None:
            raise NotImplementedError()

        return self.estimator.predict(x).reshape(-1,1)
    
    def save_weights(self, fn):
        with open(fn, 'wb') as file:
            pickle.dump(self.estimator, file)

    def load_weights(self, fn):
        with open(fn, 'rb') as file:
            self.estimator = pickle.load(file)


class GP:
    def fit(self, x_train, y_train):
        kernels = []
        i = 0

        for no, (x, reducer, k) in enumerate(zip(x_train,
                                self.rf_feature_selectors,
                                self.rf_feature_reduce_to)):

            indices = (-reducer.estimator.feature_importances_).argsort()[:k]
            x_train[no] = x[:,indices]
            
            kernels.append(gpflow.kernels.RBF(active_dims=i+np.arange(k)))

            i += k

        x_train = np.hstack(x_train)
        kernel = gpflow.kernels.Sum(kernels)


        self.model = gpflow.models.GPR(data=(x_train.astype(np.float64), y_train.astype(np.float64)), kernel=kernel,
                                        mean_function=None)

        opt = gpflow.optimizers.Scipy()
        opt.minimize(lambda: -self.model.log_marginal_likelihood(), self.model.trainable_variables,
            options={'maxiter': 500})
        
    def predict(self, x_in):
        for no, (x, reducer, k) in enumerate(zip(x_in,
                                self.rf_feature_selectors,
                                self.rf_feature_reduce_to)):

            indices = (-reducer.estimator.feature_importances_).argsort()[:k]
            x_in[no] = x[:,indices]

        x = np.hstack(x_in)
        return self.model.predict_y(x.astype(np.float64))[0]

    def save_weights(self, fn):
        checkpoint = tf.train.Checkpoint(a=self.model)
        manager = tf.train.CheckpointManager(checkpoint, fn, max_to_keep=9999)
        
        manager.save()



BENCHMARK SETUP


In [18]:
_benchmarks = {
    'dnn_mordred': {'model': models.DNN_Mordred, 'encoding': 'mordred'},
    'dnn_ecfp': {'model': models.DNN_ECFP, 'encoding': 'ecfp_2048'},
    'rf_mordred': {'model': RF, 'encoding': 'mordred'},
    'rf_ecfp': {'model': RF, 'encoding': 'ecfp_4096'},
    'rf_nmf_ecfp': {'model': models.RF_NMF_ECFP, 'encoding': 'ecfp_4096'},
    'gp': {'model': GP, 'encoding': 'gp'},
    'gcn': {'model': models.GCN, 'encoding': 'smiles'}
}

# benchmarks to train/validate, check _benchmark_dict for options
run_benchmarks = ['gp'] 

# `random` or `stratified`
sampling_type = 'random'

kfold = CrossValidator(
    splits=5, # dont change without re-running data preprocessing
    sampling_type = sampling_type,
)

converter = LD50UnitConverter()

TRAINING/PREDICTING


In [None]:
for identity in run_benchmarks:  
    benchmark = _benchmarks[identity]
    
    folds = enumerate(kfold.get_folds(benchmark['encoding']))
    
    for fold_no, (train, test) in folds:           
        x_train, y_train, smiles_train = train
        x_test, y_test, smiles_test = test
        
        
        y_train = scaler.fit_transform(y_train)
        
        model = _benchmarks[identity]['model']()
        
        # <Gaussian process has special step for selecting parameters based on
        # RF benchmark models for ECFP and Mordred
        if identity == 'gp':
            fn = 'rf_mordred' + str(fold_no) + '_' + sampling_type
            mordred_rf = _benchmarks['rf_mordred']['model']()
            mordred_rf.load_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)
            
            fn = 'rf_ecfp' + str(fold_no) + '_' + sampling_type
            ecfp_rf = _benchmarks['rf_ecfp']['model']()
            ecfp_rf.load_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)
            
            model.rf_feature_selectors = (mordred_rf,ecfp_rf)
            model.rf_feature_reduce_to = (10,200)
            # Ensure x_train is a list of numpy arrays
            if not isinstance(x_train, list):
                x_train = [x_train]
            x_train = [x.astype(np.float64) for x in x_train]
            y_train = y_train.astype(np.float64)
        
        # End of special GP step>
        
        model.fit(x_train, y_train)
        
        #save model
        fn = identity + str(fold_no) + '_' + sampling_type
        # mordred_rf.save_weights('../../data/benchmark-models/chkpts/rf_mordred0_random.chkpt')
        model.save_weights('../../data/benchmark-models/chkpts/%s.chkpt' % fn)

        
        
        y_hat = scaler.inverse_transform(model.predict(x_test))
    
                
        results = pd.DataFrame({
            'smiles': smiles_test.flatten(),
            'prediction_neglogld50': y_hat.flatten(),
            'prediction_mgkg': converter.convert_to_mgkg(y_hat, smiles_test),
            'prediction_epa': converter.convert_to_epa(y_hat, smiles_test),
        
            'actual_neglogld50': y_test.flatten(),
            'actual_mgkg': converter.convert_to_mgkg(y_test, smiles_test),
            'actual_epa': converter.convert_to_epa(y_test, smiles_test),
        })
        
        results.to_csv('../../data/benchmark-models/%s_predictions.csv' % fn)

#RESULTS COMPARISON

In [20]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

compare_benchmarks = ['rf_mordred','rf_ecfp','gp'] 

aggregated = pd.DataFrame()
    
for identity in compare_benchmarks:
    benchmark = _benchmarks[identity]
    
    folds = enumerate(kfold.get_folds(benchmark['encoding']))
    
    for fold_no, data in folds:
        fn = identity + str(fold_no) + '_' + sampling_type
        
        validation = pd.read_csv('../../data/benchmark-models/%s_predictions.csv' % fn)

        ##changed here becase pandas does not use aggregrate anymore..so changed to pd.concat
        # Create a DataFrame for the new row to be added
        new_row = pd.DataFrame({
            'benchmark': [identity],
            'r2': [r2_score(validation['actual_neglogld50'], validation['prediction_neglogld50'])],
            'mae': [mean_absolute_error(validation['actual_neglogld50'], validation['prediction_neglogld50'])],
            'rmse': [mean_squared_error(validation['actual_neglogld50'], validation['prediction_neglogld50'], squared=False)],
            'accuracy': [np.sum(validation['actual_epa'] == validation['prediction_epa']) / len(validation)]
        })
        aggregated = pd.concat([aggregated, new_row], ignore_index=True)



In [21]:
aggregated.pivot_table(index='benchmark', aggfunc=np.mean)

  aggregated.pivot_table(index='benchmark', aggfunc=np.mean)


Unnamed: 0_level_0,accuracy,mae,r2,rmse
benchmark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gp,0.654113,0.375403,0.622302,0.541276
rf_ecfp,0.596113,0.448352,0.524401,0.607475
rf_mordred,0.646534,0.379422,0.633562,0.533226
