# MolMap ensemble (dual-path approach)

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]='0'

import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Iterating (instead of indexing element 0) makes this a no-op rather than an
# IndexError when no GPU is visible to TensorFlow.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

### Load data

In [2]:
# Set the file name of your data
data_name = "CYP450.csv.gz"

In [3]:
import numpy as np
import pandas as pd

# Read the gzip-compressed CSV named by `data_name` from the relative data folder.
data = pd.read_csv(
    filepath_or_buffer="../../../data/" + data_name,
    compression="gzip",
)

Set up (adapt according to your data)

In [4]:
# Restrict the frame to the SMILES column and the label column, then discard
# every molecule whose label is missing.
data = data[['smiles', 'label_2c9']].dropna(subset=["label_2c9"])

# SMILES strings as a plain Python list
smi = data['smiles'].tolist()

# one-hot encoded labels as a NumPy array
Y = pd.get_dummies(data['label_2c9']).values

# Class balance, read from the second one-hot column (the prints below treat
# 0 in that column as inactive and 1 as active).
_second_column = Y[:, 1].tolist()
print("Inactive (0):", _second_column.count(0))
print("Active (1):", _second_column.count(1))

Inactive (0): 7429
Active (1): 2621


### MolMap Ensemble

In [5]:
# Number of class-0 (inactive) molecules given to each model.
# Class 0 is the majority class in this data set (see the counts printed
# above); it is the class that gets cut into chunks of this size.
size_0 = 500

# Count class 0 from the label column itself instead of the one-hot array `Y`,
# so re-running this cell always gives the same answer no matter what `Y`
# currently holds.
n_class_0 = int((data["label_2c9"] == 0).sum())

# Only complete chunks are used; a trailing remainder smaller than size_0 is dropped.
partitions = n_class_0 // size_0
print("Total number of models:", partitions)

Total number of models: 14


In [6]:
# Get classes 0 and 1 in two different variables
class_0 = data[data["label_2c9"] == 0.0]
class_1 = data[data["label_2c9"] == 1.0]

# Shuffle class 0 before dividing it into parts. The seed is fixed so that the
# partitions are the same on every Restart & Run All.
class_0 = class_0.sample(frac=1, random_state=888).reset_index(drop=True)

In [7]:
from molmap import MolMap
from molmap import feature
from molmap.model import MultiClassEstimator
import sys
sys.path.append("../../../src")
from utils import Rdsplit

# Fit both feature maps once, before the loop. Neither fit() call receives any
# molecule data, so refitting them for every model only repeats the same work.
# NOTE(review): assumes MolMap.fit is deterministic for fixed arguments and
# that batch_transform does not modify the fitted map -- confirm.

# compute MolDs
mp1 = MolMap(ftype='descriptor', metric='cosine')
mp1.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15)

# compute FFs
bitsinfo = feature.fingerprint.Extraction().bitsinfo
flist = bitsinfo[bitsinfo.Subtypes.isin(['PubChemFP', 'MACCSFP', 'PharmacoErGFP'])].IDs.tolist()
mp2 = MolMap(ftype='fingerprint', fmap_type='scatter', flist=flist)
mp2.fit(method='umap', min_dist=0.1, n_neighbors=15, verbose=0)

# test score of every model in the ensemble, in training order
aucs = []

# for each model:
for i in range(partitions):

    # subset of data for model i: the i-th slice of class 0 plus all of class 1
    subdata = pd.concat([class_0[i*size_0:(i+1)*size_0], class_1])

    # SMILES and one-hot labels of this subset. Loop-local names are used so
    # the notebook-level `smi` and `Y` are not overwritten by the loop.
    sub_smi = subdata['smiles']
    # dtype is pinned because the default changed from uint8 to bool in pandas 2.0
    sub_Y = pd.get_dummies(subdata['label_2c9'], dtype='uint8').values

    # get Fmaps
    # NOTE(review): class_1 is transformed again in every iteration; caching
    # its feature maps would save time at the cost of memory.
    X1 = mp1.batch_transform(sub_smi)
    X2 = mp2.batch_transform(sub_smi)

    # split train, val, test
    # NOTE(review): the returned indices are used positionally on X1/X2/sub_Y;
    # presumably Rdsplit returns positions, not index labels -- confirm.
    train_idx, valid_idx, test_idx = Rdsplit(subdata, random_state=888)
    trainX = (X1[train_idx], X2[train_idx])
    validX = (X1[valid_idx], X2[valid_idx])
    testX = (X1[test_idx], X2[test_idx])
    trainY = sub_Y[train_idx]
    validY = sub_Y[valid_idx]
    testY = sub_Y[test_idx]

    # model
    clf = MultiClassEstimator(n_outputs=trainY.shape[1],
                              fmap_shape1=X1.shape[1:],
                              fmap_shape2=X2.shape[1:],
                              metric='ROC',
                              dense_layers=[128, 64], gpuid=0, epochs=100,
                              )
    # fit model
    clf.fit(trainX, trainY, validX, validY)

    # score on the held-out test split
    auc = clf.score(testX, testY)
    aucs.append(auc)

2022-06-29 11:55:59,775 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2022-06-29 11:56:02,605 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m
2022-06-29 11:56:09,629 - [32mINFO[0m - [bidd-molmap][0m - Applying naive scatter feature map...[0m
2022-06-29 11:56:09,648 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 3121/3121 [05:08<00:00, 10.10it/s]
100%|##########| 3121/3121 [04:14<00:00, 12.28it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4767 - val_loss: 0.4741; auc: 0.6944 - val_auc: 0.6896                                                                                                    
epoch: 0002, loss: 0.4208 - val_loss: 0.4471; auc: 0.7616 - val_auc: 0.7355                                                                                                    
epoch: 0003, loss: 0.4083 - val_loss: 0.4359; auc: 0.7842 - val_auc: 0.7542                                                                                                    
epoch: 0004, loss: 0.3916 - val_loss: 0.4289; auc: 0.7930 - val_auc: 0.7622                                                                                                    
epoch: 0005, loss: 0.3789 - val_loss: 0.4101; auc: 0.8110 - val_auc: 0.7782                                                                                 

100%|##########| 3121/3121 [03:51<00:00, 13.47it/s]
100%|##########| 3121/3121 [02:24<00:00, 21.53it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4764 - val_loss: 0.4642; auc: 0.7003 - val_auc: 0.7660                                                                                                    
epoch: 0002, loss: 0.4200 - val_loss: 0.4364; auc: 0.7637 - val_auc: 0.7982                                                                                                    
epoch: 0003, loss: 0.4066 - val_loss: 0.4199; auc: 0.7910 - val_auc: 0.8139                                                                                                    
epoch: 0004, loss: 0.3911 - val_loss: 0.4025; auc: 0.8047 - val_auc: 0.8220                                                                                                    
epoch: 0005, loss: 0.3767 - val_loss: 0.3834; auc: 0.8165 - val_auc: 0.8267                                                                                 

100%|##########| 3121/3121 [02:30<00:00, 20.79it/s]
100%|##########| 3121/3121 [01:42<00:00, 30.44it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4761 - val_loss: 0.4760; auc: 0.7015 - val_auc: 0.6766                                                                                                    
epoch: 0002, loss: 0.4202 - val_loss: 0.4514; auc: 0.7732 - val_auc: 0.7252                                                                                                    
epoch: 0003, loss: 0.4083 - val_loss: 0.4425; auc: 0.7979 - val_auc: 0.7369                                                                                                    
epoch: 0004, loss: 0.3941 - val_loss: 0.4375; auc: 0.8149 - val_auc: 0.7447                                                                                                    
epoch: 0005, loss: 0.3771 - val_loss: 0.4197; auc: 0.8295 - val_auc: 0.7515                                                                                 

100%|##########| 3121/3121 [02:26<00:00, 21.29it/s]
100%|##########| 3121/3121 [01:36<00:00, 32.24it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4780 - val_loss: 0.4717; auc: 0.6743 - val_auc: 0.7017                                                                                                    
epoch: 0002, loss: 0.4230 - val_loss: 0.4453; auc: 0.7533 - val_auc: 0.7680                                                                                                    
epoch: 0003, loss: 0.4121 - val_loss: 0.4313; auc: 0.7824 - val_auc: 0.7984                                                                                                    
epoch: 0004, loss: 0.3998 - val_loss: 0.4196; auc: 0.7971 - val_auc: 0.8149                                                                                                    
epoch: 0005, loss: 0.3876 - val_loss: 0.3956; auc: 0.8074 - val_auc: 0.8266                                                                                 

100%|##########| 3121/3121 [02:33<00:00, 20.27it/s]
100%|##########| 3121/3121 [01:37<00:00, 31.97it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4768 - val_loss: 0.4702; auc: 0.7047 - val_auc: 0.6874                                                                                                    
epoch: 0002, loss: 0.4205 - val_loss: 0.4422; auc: 0.7760 - val_auc: 0.7469                                                                                                    
epoch: 0003, loss: 0.4064 - val_loss: 0.4257; auc: 0.8028 - val_auc: 0.7760                                                                                                    
epoch: 0004, loss: 0.3886 - val_loss: 0.4108; auc: 0.8170 - val_auc: 0.7945                                                                                                    
epoch: 0005, loss: 0.3710 - val_loss: 0.3874; auc: 0.8307 - val_auc: 0.8102                                                                                 

100%|##########| 3121/3121 [02:33<00:00, 20.35it/s]
100%|##########| 3121/3121 [01:37<00:00, 32.01it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4774 - val_loss: 0.4753; auc: 0.6809 - val_auc: 0.6744                                                                                                    
epoch: 0002, loss: 0.4223 - val_loss: 0.4493; auc: 0.7510 - val_auc: 0.7218                                                                                                    
epoch: 0003, loss: 0.4108 - val_loss: 0.4386; auc: 0.7780 - val_auc: 0.7491                                                                                                    
epoch: 0004, loss: 0.3988 - val_loss: 0.4288; auc: 0.7889 - val_auc: 0.7638                                                                                                    
epoch: 0005, loss: 0.3868 - val_loss: 0.4210; auc: 0.8057 - val_auc: 0.7774                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.86it/s]
100%|##########| 3121/3121 [01:36<00:00, 32.35it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4775 - val_loss: 0.4719; auc: 0.6889 - val_auc: 0.7051                                                                                                    
epoch: 0002, loss: 0.4213 - val_loss: 0.4441; auc: 0.7613 - val_auc: 0.7861                                                                                                    
epoch: 0003, loss: 0.4086 - val_loss: 0.4291; auc: 0.7877 - val_auc: 0.8119                                                                                                    
epoch: 0004, loss: 0.3931 - val_loss: 0.4153; auc: 0.7992 - val_auc: 0.8195                                                                                                    
epoch: 0005, loss: 0.3791 - val_loss: 0.3965; auc: 0.8153 - val_auc: 0.8341                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.86it/s]
100%|##########| 3121/3121 [01:37<00:00, 32.08it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4766 - val_loss: 0.4784; auc: 0.6904 - val_auc: 0.6678                                                                                                    
epoch: 0002, loss: 0.4222 - val_loss: 0.4513; auc: 0.7551 - val_auc: 0.7207                                                                                                    
epoch: 0003, loss: 0.4109 - val_loss: 0.4409; auc: 0.7807 - val_auc: 0.7471                                                                                                    
epoch: 0004, loss: 0.3967 - val_loss: 0.4327; auc: 0.7949 - val_auc: 0.7614                                                                                                    
epoch: 0005, loss: 0.3837 - val_loss: 0.4133; auc: 0.8112 - val_auc: 0.7743                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.90it/s]
100%|##########| 3121/3121 [01:36<00:00, 32.32it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4768 - val_loss: 0.4751; auc: 0.6738 - val_auc: 0.6703                                                                                                    
epoch: 0002, loss: 0.4225 - val_loss: 0.4495; auc: 0.7456 - val_auc: 0.7156                                                                                                    
epoch: 0003, loss: 0.4111 - val_loss: 0.4381; auc: 0.7810 - val_auc: 0.7442                                                                                                    
epoch: 0004, loss: 0.3992 - val_loss: 0.4305; auc: 0.7980 - val_auc: 0.7618                                                                                                    
epoch: 0005, loss: 0.3871 - val_loss: 0.4134; auc: 0.8110 - val_auc: 0.7775                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.95it/s]
100%|##########| 3121/3121 [01:37<00:00, 32.14it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4763 - val_loss: 0.4681; auc: 0.7084 - val_auc: 0.7085                                                                                                    
epoch: 0002, loss: 0.4187 - val_loss: 0.4430; auc: 0.7774 - val_auc: 0.7444                                                                                                    
epoch: 0003, loss: 0.4037 - val_loss: 0.4300; auc: 0.8005 - val_auc: 0.7605                                                                                                    
epoch: 0004, loss: 0.3863 - val_loss: 0.4183; auc: 0.8133 - val_auc: 0.7692                                                                                                    
epoch: 0005, loss: 0.3687 - val_loss: 0.4028; auc: 0.8287 - val_auc: 0.7760                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.83it/s]
100%|##########| 3121/3121 [01:36<00:00, 32.30it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4771 - val_loss: 0.4753; auc: 0.6947 - val_auc: 0.6924                                                                                                    
epoch: 0002, loss: 0.4214 - val_loss: 0.4461; auc: 0.7576 - val_auc: 0.7615                                                                                                    
epoch: 0003, loss: 0.4095 - val_loss: 0.4320; auc: 0.7782 - val_auc: 0.7816                                                                                                    
epoch: 0004, loss: 0.3946 - val_loss: 0.4194; auc: 0.7915 - val_auc: 0.7919                                                                                                    
epoch: 0005, loss: 0.3825 - val_loss: 0.3997; auc: 0.8048 - val_auc: 0.8036                                                                                 

100%|##########| 3121/3121 [02:21<00:00, 21.99it/s]
100%|##########| 3121/3121 [01:36<00:00, 32.27it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4767 - val_loss: 0.4691; auc: 0.7029 - val_auc: 0.7304                                                                                                    
epoch: 0002, loss: 0.4200 - val_loss: 0.4413; auc: 0.7637 - val_auc: 0.7861                                                                                                    
epoch: 0003, loss: 0.4079 - val_loss: 0.4259; auc: 0.7875 - val_auc: 0.8084                                                                                                    
epoch: 0004, loss: 0.3930 - val_loss: 0.4105; auc: 0.8011 - val_auc: 0.8203                                                                                                    
epoch: 0005, loss: 0.3791 - val_loss: 0.3936; auc: 0.8156 - val_auc: 0.8322                                                                                 

100%|##########| 3121/3121 [02:22<00:00, 21.86it/s]
100%|##########| 3121/3121 [01:37<00:00, 32.00it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4773 - val_loss: 0.4757; auc: 0.6837 - val_auc: 0.6519                                                                                                    
epoch: 0002, loss: 0.4211 - val_loss: 0.4492; auc: 0.7553 - val_auc: 0.7059                                                                                                    
epoch: 0003, loss: 0.4084 - val_loss: 0.4385; auc: 0.7845 - val_auc: 0.7317                                                                                                    
epoch: 0004, loss: 0.3947 - val_loss: 0.4311; auc: 0.7974 - val_auc: 0.7411                                                                                                    
epoch: 0005, loss: 0.3802 - val_loss: 0.4187; auc: 0.8105 - val_auc: 0.7515                                                                                 

100%|##########| 3121/3121 [02:33<00:00, 20.31it/s]
100%|##########| 3121/3121 [01:39<00:00, 31.52it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4768 - val_loss: 0.4704; auc: 0.6912 - val_auc: 0.7158                                                                                                    
epoch: 0002, loss: 0.4218 - val_loss: 0.4446; auc: 0.7528 - val_auc: 0.7588                                                                                                    
epoch: 0003, loss: 0.4109 - val_loss: 0.4322; auc: 0.7800 - val_auc: 0.7776                                                                                                    
epoch: 0004, loss: 0.3974 - val_loss: 0.4250; auc: 0.7918 - val_auc: 0.7843                                                                                                    
epoch: 0005, loss: 0.3858 - val_loss: 0.4058; auc: 0.8144 - val_auc: 0.7987                                                                                 

In [8]:
# Summarise the ensemble. The model count is taken from the scores actually
# collected, so it stays correct if the training loop was interrupted.
print("Total number of models trained:", len(aucs))
print("Mean of AUCs:", round(np.mean(aucs), 3))
# The variance is small enough to round to 0.0 at three decimals, so it is
# shown in scientific notation instead.
print("Variance of AUCs:", "{:.2e}".format(np.var(aucs)))

Total number of models trained: 14
Mean of accuracies: 0.894
Variance of accuracies: 0.0
