# MolMap ensemble (dual-path approach)

In [1]:
import os

# Select the GPU before TensorFlow is imported: the CUDA runtime reads these
# variables when it initialises, so setting them afterwards has no effect.
# PCI_BUS_ID makes device numbering follow the PCI bus order, and only
# device "0" is exposed to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import tensorflow as tf

# Allocate GPU memory on demand instead of reserving all of it up front.
# Iterating over the list (rather than indexing [0]) keeps this cell from
# raising IndexError on a machine where TensorFlow sees no GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)
if not physical_devices:
    print("No GPU visible to TensorFlow; continuing without GPU memory-growth setup.")

### Load data

In [2]:
# Set the file name of your data.
# The loading cell below expects a gzip-compressed CSV with this name
# inside the ../../../data/ directory.
data_name = "CYP450.csv.gz"

In [3]:
import numpy as np
import pandas as pd

# Read the gzip-compressed CSV named by `data_name` from the shared data
# directory into a DataFrame.
data = pd.read_csv(f"../../../data/{data_name}", compression="gzip")

### Set up (adapt this section to your data)

In [4]:
# Keep only the SMILES column and the 'label_2c9' activity column, and drop
# molecules whose activity is missing (NaN).
data = data[['smiles', 'label_2c9']].dropna(subset=["label_2c9"])

# SMILES strings of all retained molecules
smi = data['smiles'].tolist()

# One-hot encoded labels: one column per distinct label value.
# The dtype is pinned explicitly because pandas >= 2.0 returns boolean
# dummy columns by default (earlier versions returned uint8); a numeric
# 0/1 encoding is what the counts below are written against.
Y = pd.get_dummies(data['label_2c9'], dtype=np.uint8).values

# Column 1 is read as the "active" indicator below, which is only valid
# when the label takes exactly two values.
assert Y.shape[1] == 2, "expected a binary label (two one-hot columns)"

# number of active and inactive molecules
n_active = int(Y[:, 1].sum())
n_inactive = len(Y) - n_active
print("Inactive (0):", n_inactive)
print("Active (1):", n_active)

Inactive (0): 7429
Active (1): 2621


### MolMap Ensemble

In [5]:
# Number of class-0 molecules given to each model.
# NOTE: class 0 is the larger class in the full data set; it only becomes
# the minority class *within* each model's subset when size_0 is smaller
# than the number of class-1 molecules.
size_0 = 500
if size_0 <= 0:
    raise ValueError("size_0 must be a positive number")

# Count class 0 directly from the label column instead of from the global
# `Y`, so that re-running this cell still gives the right answer even if
# `Y` has since been reassigned to a subset of the data.
n_class_0 = int((data["label_2c9"] == 0).sum())

# Only complete chunks of size_0 are used: any class-0 molecules left over
# after the last full chunk are not assigned to a model.
partitions = int(n_class_0 // size_0)
print("Total number of models:", partitions)

Total number of models: 14


In [6]:
# Get classes 0 and 1 in two different variables
class_0 = data[data["label_2c9"] == 0.0]
class_1 = data[data["label_2c9"] == 1.0]

# Shuffle class 0 before dividing it into parts. The seed is fixed so that
# a fresh "Restart & Run All" assigns the same molecules to the same model;
# without it every run produces a different partitioning and the reported
# scores cannot be reproduced.
class_0 = class_0.sample(frac=1, random_state=888).reset_index(drop=True)

In [7]:
from molmap import MolMap
from molmap import feature
from molmap.model import MultiClassEstimator
import sys
sys.path.append("../../../src")
from utils import Rdsplit

# --- Feature maps --------------------------------------------------------
# Neither fit() call below receives anything derived from the per-model
# subset, so both maps are built once here instead of being refitted on
# every loop iteration.

# MolDs: descriptor-based map
mp1 = MolMap(ftype='descriptor', metric='cosine',)
mp1.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15)

# FFs: fingerprint-based map restricted to three fingerprint subtypes
bitsinfo = feature.fingerprint.Extraction().bitsinfo
flist = bitsinfo[bitsinfo.Subtypes.isin(['PubChemFP', 'MACCSFP', 'PharmacoErGFP'])].IDs.tolist()
mp2 = MolMap(ftype = 'fingerprint', fmap_type = 'scatter', flist = flist)
mp2.fit(method = 'umap',  min_dist = 0.1, n_neighbors = 15, verbose = 0)

# --- One model per class-0 chunk -----------------------------------------
# Test score of each model, in training order.
aucs = []
for i in range(partitions):

    # Subset for model i: the i-th chunk of (shuffled) class 0 plus all of
    # class 1. ignore_index gives the subset a clean 0..n-1 index; without
    # it the two parts carry overlapping index labels.
    chunk_0 = class_0.iloc[i * size_0:(i + 1) * size_0]
    subdata = pd.concat([chunk_0, class_1], ignore_index=True)

    # Per-subset SMILES and one-hot labels. Distinct names are used so the
    # notebook-level `smi` and `Y` of the full data set are not overwritten.
    sub_smi = subdata['smiles'].tolist()
    # dtype pinned: pandas >= 2.0 would otherwise return boolean columns.
    sub_Y = pd.get_dummies(subdata['label_2c9'], dtype=np.uint8).values

    # get Fmaps (one array per feature map, one entry per molecule)
    X1 = mp1.batch_transform(sub_smi)
    X2 = mp2.batch_transform(sub_smi)

    # split train, val, test
    # NOTE(review): the returned indices are used positionally on the numpy
    # arrays below; Rdsplit is a project helper not shown here -- confirm it
    # returns positions rather than index labels.
    train_idx, valid_idx, test_idx = Rdsplit(subdata, random_state = 888)
    trainX = (X1[train_idx], X2[train_idx])
    validX = (X1[valid_idx], X2[valid_idx])
    testX = (X1[test_idx], X2[test_idx])
    trainY = sub_Y[train_idx]
    validY = sub_Y[valid_idx]
    testY = sub_Y[test_idx]

    # dual-path model: one input per feature map
    clf = MultiClassEstimator(n_outputs=trainY.shape[1],
                              fmap_shape1 = X1.shape[1:],
                              fmap_shape2 = X2.shape[1:],
                              metric='ROC',
                              dense_layers = [128, 64],  gpuid = 0, epochs = 100,
                              )
    # fit model
    clf.fit(trainX, trainY, validX, validY)

    # score on the held-out test split of this subset
    # (presumably ROC-AUC given metric='ROC' -- confirm against molmap docs)
    auc = clf.score(testX, testY)
    aucs.append(auc)

2022-06-30 14:26:02,743 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2022-06-30 14:26:04,879 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m
2022-06-30 14:26:10,859 - [32mINFO[0m - [bidd-molmap][0m - Applying naive scatter feature map...[0m
2022-06-30 14:26:10,871 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 3121/3121 [02:36<00:00, 19.93it/s]
100%|##########| 3121/3121 [01:34<00:00, 33.02it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4769 - val_loss: 0.4684; auc: 0.6840 - val_auc: 0.7285                                                                                                    
epoch: 0002, loss: 0.4227 - val_loss: 0.4413; auc: 0.7464 - val_auc: 0.7759                                                                                                    
epoch: 0003, loss: 0.4115 - val_loss: 0.4272; auc: 0.7712 - val_auc: 0.7964                                                                                                    
epoch: 0004, loss: 0.3992 - val_loss: 0.4110; auc: 0.7815 - val_auc: 0.8047                                                                                                    
epoch: 0005, loss: 0.3863 - val_loss: 0.3939; auc: 0.7966 - val_auc: 0.8171                                                                                 

100%|##########| 3121/3121 [02:20<00:00, 22.17it/s]
100%|##########| 3121/3121 [01:49<00:00, 28.53it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4765 - val_loss: 0.4737; auc: 0.6956 - val_auc: 0.6901                                                                                                    
epoch: 0002, loss: 0.4202 - val_loss: 0.4455; auc: 0.7667 - val_auc: 0.7600                                                                                                    
epoch: 0003, loss: 0.4074 - val_loss: 0.4314; auc: 0.7903 - val_auc: 0.7840                                                                                                    
epoch: 0004, loss: 0.3921 - val_loss: 0.4204; auc: 0.8027 - val_auc: 0.7937                                                                                                    
epoch: 0005, loss: 0.3763 - val_loss: 0.3980; auc: 0.8181 - val_auc: 0.8084                                                                                 

100%|##########| 3121/3121 [02:45<00:00, 18.90it/s]
100%|##########| 3121/3121 [02:14<00:00, 23.15it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4771 - val_loss: 0.4737; auc: 0.6712 - val_auc: 0.6748                                                                                                    
epoch: 0002, loss: 0.4229 - val_loss: 0.4485; auc: 0.7399 - val_auc: 0.7331                                                                                                    
epoch: 0003, loss: 0.4119 - val_loss: 0.4368; auc: 0.7690 - val_auc: 0.7658                                                                                                    
epoch: 0004, loss: 0.3996 - val_loss: 0.4240; auc: 0.7812 - val_auc: 0.7797                                                                                                    
epoch: 0005, loss: 0.3847 - val_loss: 0.4084; auc: 0.7949 - val_auc: 0.7908                                                                                 

100%|##########| 3121/3121 [02:33<00:00, 20.40it/s]
100%|##########| 3121/3121 [01:49<00:00, 28.56it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4774 - val_loss: 0.4725; auc: 0.7007 - val_auc: 0.6710                                                                                                    
epoch: 0002, loss: 0.4197 - val_loss: 0.4473; auc: 0.7756 - val_auc: 0.7241                                                                                                    
epoch: 0003, loss: 0.4057 - val_loss: 0.4324; auc: 0.7993 - val_auc: 0.7451                                                                                                    
epoch: 0004, loss: 0.3886 - val_loss: 0.4263; auc: 0.8118 - val_auc: 0.7538                                                                                                    
epoch: 0005, loss: 0.3731 - val_loss: 0.4035; auc: 0.8287 - val_auc: 0.7711                                                                                 

100%|##########| 3121/3121 [02:25<00:00, 21.43it/s]
100%|##########| 3121/3121 [01:34<00:00, 33.09it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4778 - val_loss: 0.4730; auc: 0.6867 - val_auc: 0.6911                                                                                                    
epoch: 0002, loss: 0.4223 - val_loss: 0.4442; auc: 0.7607 - val_auc: 0.7691                                                                                                    
epoch: 0003, loss: 0.4099 - val_loss: 0.4292; auc: 0.7874 - val_auc: 0.7982                                                                                                    
epoch: 0004, loss: 0.3950 - val_loss: 0.4145; auc: 0.8012 - val_auc: 0.8130                                                                                                    
epoch: 0005, loss: 0.3793 - val_loss: 0.3892; auc: 0.8194 - val_auc: 0.8277                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.13it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.38it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4771 - val_loss: 0.4671; auc: 0.6862 - val_auc: 0.7446                                                                                                    
epoch: 0002, loss: 0.4226 - val_loss: 0.4396; auc: 0.7506 - val_auc: 0.8076                                                                                                    
epoch: 0003, loss: 0.4115 - val_loss: 0.4220; auc: 0.7838 - val_auc: 0.8420                                                                                                    
epoch: 0004, loss: 0.3987 - val_loss: 0.4024; auc: 0.7984 - val_auc: 0.8567                                                                                                    
epoch: 0005, loss: 0.3858 - val_loss: 0.3770; auc: 0.8143 - val_auc: 0.8698                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.13it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.39it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4775 - val_loss: 0.4743; auc: 0.6882 - val_auc: 0.6484                                                                                                    
epoch: 0002, loss: 0.4213 - val_loss: 0.4497; auc: 0.7535 - val_auc: 0.7071                                                                                                    
epoch: 0003, loss: 0.4098 - val_loss: 0.4380; auc: 0.7802 - val_auc: 0.7376                                                                                                    
epoch: 0004, loss: 0.3966 - val_loss: 0.4303; auc: 0.7912 - val_auc: 0.7503                                                                                                    
epoch: 0005, loss: 0.3829 - val_loss: 0.4131; auc: 0.8099 - val_auc: 0.7659                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.21it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.54it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4766 - val_loss: 0.4691; auc: 0.6874 - val_auc: 0.7256                                                                                                    
epoch: 0002, loss: 0.4210 - val_loss: 0.4431; auc: 0.7582 - val_auc: 0.7728                                                                                                    
epoch: 0003, loss: 0.4084 - val_loss: 0.4300; auc: 0.7850 - val_auc: 0.7858                                                                                                    
epoch: 0004, loss: 0.3946 - val_loss: 0.4187; auc: 0.7964 - val_auc: 0.7909                                                                                                    
epoch: 0005, loss: 0.3786 - val_loss: 0.4007; auc: 0.8119 - val_auc: 0.7966                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.15it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.27it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4766 - val_loss: 0.4750; auc: 0.6898 - val_auc: 0.6500                                                                                                    
epoch: 0002, loss: 0.4223 - val_loss: 0.4473; auc: 0.7531 - val_auc: 0.7323                                                                                                    
epoch: 0003, loss: 0.4114 - val_loss: 0.4341; auc: 0.7725 - val_auc: 0.7643                                                                                                    
epoch: 0004, loss: 0.3972 - val_loss: 0.4201; auc: 0.7850 - val_auc: 0.7919                                                                                                    
epoch: 0005, loss: 0.3825 - val_loss: 0.3930; auc: 0.8008 - val_auc: 0.8147                                                                                 

100%|##########| 3121/3121 [02:15<00:00, 23.11it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.24it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4783 - val_loss: 0.4718; auc: 0.6588 - val_auc: 0.6885                                                                                                    
epoch: 0002, loss: 0.4238 - val_loss: 0.4469; auc: 0.7320 - val_auc: 0.7452                                                                                                    
epoch: 0003, loss: 0.4141 - val_loss: 0.4345; auc: 0.7644 - val_auc: 0.7743                                                                                                    
epoch: 0004, loss: 0.4036 - val_loss: 0.4227; auc: 0.7813 - val_auc: 0.7863                                                                                                    
epoch: 0005, loss: 0.3936 - val_loss: 0.4104; auc: 0.7953 - val_auc: 0.7987                                                                                 

100%|##########| 3121/3121 [02:15<00:00, 23.10it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.47it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4783 - val_loss: 0.4679; auc: 0.6750 - val_auc: 0.7367                                                                                                    
epoch: 0002, loss: 0.4237 - val_loss: 0.4418; auc: 0.7473 - val_auc: 0.8021                                                                                                    
epoch: 0003, loss: 0.4134 - val_loss: 0.4276; auc: 0.7799 - val_auc: 0.8322                                                                                                    
epoch: 0004, loss: 0.4011 - val_loss: 0.4126; auc: 0.7952 - val_auc: 0.8465                                                                                                    
epoch: 0005, loss: 0.3889 - val_loss: 0.3891; auc: 0.8124 - val_auc: 0.8577                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.19it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.31it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4756 - val_loss: 0.4703; auc: 0.6931 - val_auc: 0.7091                                                                                                    
epoch: 0002, loss: 0.4193 - val_loss: 0.4425; auc: 0.7603 - val_auc: 0.7598                                                                                                    
epoch: 0003, loss: 0.4063 - val_loss: 0.4286; auc: 0.7885 - val_auc: 0.7834                                                                                                    
epoch: 0004, loss: 0.3906 - val_loss: 0.4148; auc: 0.8026 - val_auc: 0.7944                                                                                                    
epoch: 0005, loss: 0.3760 - val_loss: 0.3900; auc: 0.8227 - val_auc: 0.8102                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.17it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.26it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4770 - val_loss: 0.4706; auc: 0.6949 - val_auc: 0.6996                                                                                                    
epoch: 0002, loss: 0.4225 - val_loss: 0.4439; auc: 0.7641 - val_auc: 0.7644                                                                                                    
epoch: 0003, loss: 0.4111 - val_loss: 0.4300; auc: 0.7891 - val_auc: 0.7920                                                                                                    
epoch: 0004, loss: 0.3982 - val_loss: 0.4154; auc: 0.7999 - val_auc: 0.8067                                                                                                    
epoch: 0005, loss: 0.3852 - val_loss: 0.3934; auc: 0.8135 - val_auc: 0.8229                                                                                 

100%|##########| 3121/3121 [02:14<00:00, 23.23it/s]
100%|##########| 3121/3121 [01:33<00:00, 33.34it/s]


2497 312 312
MultiClassEstimator(epochs=100, fmap_shape1=(37, 37, 13),
                    fmap_shape2=(72, 72, 3), gpuid='0', n_outputs=2)
epoch: 0001, loss: 0.4769 - val_loss: 0.4730; auc: 0.6967 - val_auc: 0.6878                                                                                                    
epoch: 0002, loss: 0.4212 - val_loss: 0.4439; auc: 0.7642 - val_auc: 0.7666                                                                                                    
epoch: 0003, loss: 0.4093 - val_loss: 0.4285; auc: 0.7871 - val_auc: 0.7940                                                                                                    
epoch: 0004, loss: 0.3946 - val_loss: 0.4163; auc: 0.8023 - val_auc: 0.8092                                                                                                    
epoch: 0005, loss: 0.3818 - val_loss: 0.3969; auc: 0.8183 - val_auc: 0.8271                                                                                 

In [8]:
# Summarise the per-model test scores collected in `aucs`.
# The count is taken from `aucs` itself rather than from `partitions`, so
# it reflects the models that actually finished (e.g. after an interrupted
# training loop) instead of the number that was planned.
if aucs:
    print("Total number of models trained:", len(aucs))
    print("Mean of AUCs:", round(np.mean(aucs), 3))
    print("Variance of AUCs:", np.var(aucs))
else:
    # np.mean / np.var of an empty list would only yield NaN plus a warning.
    print("Total number of models trained:", 0)
    print("No scores available: run the training cell first.")

Total number of models trained: 14
Mean of AUCs: 0.89
Variance of AUCs: 0.00023931961565818526
