## Time benchmarking for every method
Reproduce results, compare algorithms wrt accuracy and training. **No need to train anything!**

In [1]:
import keras, pickle 
import keras.backend as K
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from time import time
import matplotlib.pyplot as plt

from keras.models import Model
from keras.models import load_model, model_from_json

np.random.seed(42)
from tensorflow import set_random_seed
set_random_seed(42)

TRAINEDMODELS='ECFP/TrainedModels/'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Prerequisites

In [2]:
Interactions_train = []    
with open("Interactions_Trainset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_train.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions_valid = []        
with open("Interactions_Validset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_valid.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions = [x for x in Interactions_train]
Interactions.extend(Interactions_valid)
# we use a dataframe to quickly sort targets wrt #compounds:
DF = pd.DataFrame( Interactions, columns =['Target-ID', 'Compound-ID','Std-value']) 
temp = DF.groupby(['Target-ID']).agg('count').sort_values(by='Compound-ID') # count the number of molecules
Targets = list(temp.index)
Compounds = np.unique(DF['Compound-ID'])
del temp, DF

nT=len(Targets); nC=len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT,nC,len(Interactions)))
print("A DTI matrix would be {0:.4}% dense!".format(100.0*len(Interactions)/nT/nC ))

Labels_Targ = dict()
indx=0
for x in Targets:
    Labels_Targ[x]=indx
    indx+=1
    
Labels_Comp = dict()
indx=0
for x in Compounds:
    Labels_Comp[x]=indx
    indx+=1

# Initialize sparse matrix - this will be binary
DTI = 10*np.ones((nC,nT),dtype=float)

for edge in Interactions_train:
    # each edge has "target-compound-value-active"
    DTI[ Labels_Comp[edge[1]], Labels_Targ[edge[0]] ] = edge[2]

values = [x[2] for x in Interactions]
print("New data: {0} | {1}".format(np.mean(values), np.std(values)))

# load fingerprints and prepare as feature vectors
Fingerprints={} # this contains one list per fingerprint - not efficient...
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        # we keep only those compounds which have FPs
        if tokens[2] != 'NOFP':
            fp = [int(c) for c in tokens[2] ]
            Fingerprints[ tokens[0] ] = fp
print("%d fingerprints were loaded!" % len(Fingerprints))

# split to train/test data
random_seed = 2019
np.random.seed(random_seed)

print("The sizes for train and validation sets are {0} and {1} respectivelly".format( len(Interactions_train), len(Interactions_valid) ))

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
A DTI matrix would be 2.213% dense!
New data: 4.395417284295644 | 2.588704763200101
23167 fingerprints were loaded!
The sizes for train and validation sets are 45114 and 11278 respectivelly


In [3]:
def MTL(lamda=0.02, wsl=200, whl=20, lr=0.0001):
    inputs = keras.Input(shape=(2048,))
    sharedlayer = keras.layers.Dense(wsl, activation='tanh',kernel_regularizer=regularizers.l2(lamda) )(inputs) 
    myinit = keras.initializers.Constant(4.)
    hidden = []
    for i in range(len(Targets)):
        hl = Dense(units=whl,  activation='tanh', kernel_regularizer=regularizers.l2(lamda) )(sharedlayer)
        hidden.append( Dense(1, kernel_initializer=myinit, activity_regularizer=regularizers.l1(0.0001) )(hl) )

    MTL=Model(inputs=inputs, outputs=hidden)
    MTL.compile(loss=masked_loss_function, optimizer=keras.optimizers.adam(lr=0.0001))
    return MTL

def masked_loss_function(y_true, y_pred, MissingVal=10):
    # This function masks the elements of the vectors with true/predicted values so that the model focuses
    # only on the known data. By default, missing values are represented by 10
    mask = K.cast(K.not_equal(y_true, MissingVal), K.floatx())
    return keras.losses.mean_squared_error(y_true * mask, y_pred * mask)

def MTL_Drop( wsl, whl, drop_rate=0.1, lr=0.001):
    # a function that creates a NN with dropout incorporated after the first hidden layer
    inputs = keras.Input(shape=(2048,))
    sharedlayer = keras.layers.Dense(wsl, activation='tanh' )(inputs) 
    dropout= keras.layers.Dropout(drop_rate)(sharedlayer, training=True)
    myinit = keras.initializers.Constant(4.)
    hidden = []
    for i in range(len(Targets)):
        hl = Dense(units=whl,  activation='tanh', kernel_regularizer=regularizers.l2(0.05) )(dropout)
        hidden.append( Dense(1, kernel_initializer=myinit, activity_regularizer=regularizers.l1(0.0001) )(hl) )

    MTL=Model(inputs=inputs, outputs=hidden)
    MTL.compile(loss=masked_loss_function, optimizer=keras.optimizers.adam(lr=lr), )  #metrics=[masked_r2]
    return MTL

from scipy.stats import sem
from scipy.stats import t as tstat

def mulpredict(model, x_test, Ntargets, N=10, conf_flag=False):
    preds = np.zeros( (N, Ntargets) )
    for i in range(N):
        preds[i,:] = [ x[0][0] for x in model.predict( x_test ) ]
    if conf_flag:
        std_err = sem(preds, axis=0)
        h = std_err * tstat.ppf((1 + 0.95) / 2, len(preds) - 1)
        return np.mean(preds, axis=0), h
        # we need the column-wise average of this matrix
    else:
        return np.mean(preds, axis=0) 

def Evaluate(Inter_list, Comp_list, Model, Fingerprints, Ntar=110, Niter=10):
    Predictions = []
    Percomp = {} # contains dicts with lists: (target: [true, pred_NN] )
    for test_case in Comp_list:
        Percomp[ test_case ] = {}
        for tokens in Inter_list:
            if tokens[1]==test_case:
                Percomp[test_case][ tokens[0] ] = [ tokens[2] ]
        if len(Percomp[ test_case ])>0:
            # we've got some values for this compound, now produce predictions:
            preds = mulpredict(Model, np.array( Fingerprints[test_case]).reshape(1,-1), Ntar, Niter)
            
            for target in Percomp[test_case]: 
                Percomp[test_case][target].append( preds[Labels_Targ[target]])
                Predictions.append( [target,test_case, Percomp[test_case][target][0], Percomp[test_case][target][1] ])

        if len(Predictions) % 1000 == 0:
            r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
            print(f"\rMore than ", len(Predictions)," pairs have been parsed. Mean performance so far =",r2, end=" ")
    print(" ")
    r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
    print("Performance for MTL-D NN = %f" % r2)
    return Predictions

# Initialize sparse matrix - this will be binary
DTI = 10*np.ones((nC,nT),dtype=float)

for edge in Interactions_train:
    # each edge has "target-compound-value-active"
    DTI[ Labels_Comp[edge[1]], Labels_Targ[edge[0]] ] = edge[2]
DTI.shape
Compounds_to_test = set([x[1] for x in Interactions_valid])

### Multi-Task NN

In [4]:
# best_params = {'wsl': 300, 'whl': 30, 'lamda': 0.1}
# Nepochs = 40; BS=64; lr=0.0001
# wsl=best_params['wsl']; whl=best_params['whl']; lamda=best_params['lamda']
# print(wsl,whl,lamda,Nepochs,BS,lr)

# t0=time()
# MTLR = MTL(wsl=wsl, whl=whl, lamda=lamda,lr=lr )
# MTLR.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=Nepochs, batch_size=BS, verbose=0, use_multiprocessing=True )
# print("Training length with {0} epochs and BS={1} is {2}".format(Nepochs, BS, time()-t0))

MTLR = load_model('MTL-200-30-0.1-model.h5', compile=False) # custom_objects={'masked_loss_function':masked_loss_function})

t0=time()
Predictions_MTL = Evaluate( Interactions_valid, Compounds_to_test, MTLR, Fingerprints, Niter=1)
tref=1000*(time()-t0)/len(Compounds_to_test)/len(Targets)
print("Duration per 1000 predictions = ",tref )

print("R2 for MTL NN = %f" % r2_score([x[2] for x in Predictions_MTL], [x[3] for x in Predictions_MTL]) )
print("MSE for MTL NN = %f" %     MSE([x[2] for x in Predictions_MTL], [x[3] for x in Predictions_MTL]) )






More than  11000  pairs have been parsed. Mean performance so far = 0.6299385706888115  
Performance for MTL-D NN = 0.630451
Duration per 1000 predictions =  0.025042821405267358
R2 for MTL NN = 0.630451
MSE for MTL NN = 2.446853


### Multi-Task with drop-out

In [5]:
# best_params = {'wsl': 200, 'whl': 20, 'drop-rate': 0.05}
# Nepochs = 40; BS=64; lr=0.0001
# wsl=best_params['wsl']; whl=best_params['whl']; drop_rate=best_params['drop-rate']
# print(wsl,whl,lamda,Nepochs,BS,lr)

# t0=time()
# MTLD = MTL_Drop(wsl=wsl, whl=whl, drop_rate=drop_rate,lr=lr )
# MTLD.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=Nepochs, batch_size=BS, verbose=0, use_multiprocessing=True )
# print("Training length with {0} epochs and BS={1} is {2}".format(Nepochs, BS, time()-t0))

# MTLD = load_model('MTLD.200.20.005.model.h5', compile=False) # custom_objects={'masked_loss_function':masked_loss_function})
MTLD = load_model('MTLD-200-20-0.1-model.h5', compile=False) # custom_objects={'masked_loss_function':masked_loss_function})

t0=time()
Compounds_to_test = set([x[1] for x in Interactions_valid])
Predictions_MTD = Evaluate( Interactions_valid, Compounds_to_test, MTLD, Fingerprints, Niter=5)
temp = 1000*(time()-t0)/len(Compounds_to_test)/len(Targets)
print("Duration per 1000 predictions = ", temp)
print("Relative duration per 1000prd = ", temp/tref )

print("R2 for MTLD NN = %f" % r2_score([x[2] for x in Predictions_MTD], [x[3] for x in Predictions_MTD]) )
print("MSE for MTLD NN = %f" %     MSE([x[2] for x in Predictions_MTD], [x[3] for x in Predictions_MTD]) )

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
More than  11000  pairs have been parsed. Mean performance so far = 0.6220269253385564  
Performance for MTL-D NN = 0.622394
Duration per 1000 predictions =  0.08990162024360562
Relative duration per 1000prd =  3.589915800170034
R2 for MTLD NN = 0.622394
MSE for MTLD NN = 2.500201


### Random Forests

In [6]:
RF_all = {}; Y_pred = []; Y_true = []
Pertarget_RF = dict() # contains lists with tuples: (true, pred_RF, pred_NN)
Predictions_RF = [] # this will contain items as [target, compound, true, prediction]

t0=time()
for target in Targets:
    # load pre-trained models:
    with open( TRAINEDMODELS+'RF_'+target+'_'+'pIC50new.sav', 'rb') as f:
        RFR = pickle.load( f )
    RF_all[target] = RFR        
print("Models are ready. Duration so far =",time()-t0)

t0=time()
for tokens in Interactions_valid:
    # 'Target-ID', 'Compound-ID', 'Std-value'  
    if tokens[1] in Fingerprints:
        Y_true.append( tokens[2] )
        x_test = np.array( Fingerprints[tokens[1]] ).reshape(1,-1) # prepare for prediction

        model = RF_all[tokens[0]]
        Y_pred.append( model.predict( x_test ) )
        Predictions_RF.append( [tokens[0], tokens[1], tokens[2], Y_pred[-1][0]] )

    if tokens[0] in Pertarget_RF:
        Pertarget_RF[tokens[0]].append( (Y_true[-1], Y_pred[-1][0])  )
    else:
        # first time for this protein
        Pertarget_RF[tokens[0]] = [ (Y_true[-1], Y_pred[-1][0]) ]

temp = 1000*(time()-t0)/len(Predictions_RF)
print("Duration per 1000 predictions = ", temp )
print("Relative duration per 1000prd = ", temp/tref )

print("R2 for RF NN = %f" % r2_score([x[2] for x in Predictions_RF], [x[3] for x in Predictions_RF]) )
print("MSE for RF NN = %f" %     MSE([x[2] for x in Predictions_RF], [x[3] for x in Predictions_RF]) )

Models are ready. Duration so far = 2.3746955394744873
Duration per 1000 predictions =  3.636970907307026
Relative duration per 1000prd =  145.23007805111158
R2 for RF NN = 0.647436
MSE for RF NN = 2.334393


### Lasso Regression


In [7]:
from sklearn.linear_model import Lasso

LR_all = {}; Y_pred = []; Y_true = []
Pertarget_LR = dict() # contains lists with tuples: (true, pred_RF, pred_NN)
Predictions_LR = [] # this will contain items as [target, compound, true, prediction]

t0=time()
for target in Targets:
    # load pre-trained models:
    with open( TRAINEDMODELS+'LR_'+target+'_'+'pIC50new.sav', 'rb') as f:
        LR = pickle.load( f )
    LR_all[target] = LR        
print("Models are ready. Duration so far =",time()-t0)

t0=time()
for tokens in Interactions_valid:
    # 'Target-ID', 'Compound-ID', 'Std-value'  
    if tokens[1] in Fingerprints:
        Y_true.append( tokens[2] )
        x_test = np.array( Fingerprints[tokens[1]] ).reshape(1,-1) # prepare for prediction

        model = LR_all[tokens[0]]
        Y_pred.append( model.predict( x_test ) )
        Predictions_LR.append( [tokens[0], tokens[1], tokens[2], Y_pred[-1][0]] )

    if tokens[0] in Pertarget_LR:
        Pertarget_LR[tokens[0]].append( (Y_true[-1], Y_pred[-1][0])  )
    else:
        # first time for this protein
        Pertarget_LR[tokens[0]] = [ (Y_true[-1], Y_pred[-1][0]) ]

temp = 1000*(time()-t0)/len(Predictions_LR)
print("Duration per 1000 predictions = ", temp )
print("Relative duration per 1000prd = ", temp/tref )

print("R2 for LR NN = %f" % r2_score([x[2] for x in Predictions_LR], [x[3] for x in Predictions_LR]) )
print("MSE for LR NN = %f" %     MSE([x[2] for x in Predictions_LR], [x[3] for x in Predictions_LR]) )

Models are ready. Duration so far = 0.08431863784790039
Duration per 1000 predictions =  0.11387647604092488
Relative duration per 1000prd =  4.547270221596229
R2 for LR NN = 0.591053
MSE for LR NN = 2.707715


### Scikit Neural Networks

In [8]:
from sklearn.neural_network import MLPRegressor

MLPR_all = {}; Y_pred = []; Y_true = []
Pertarget_MLPR = dict() # contains lists with tuples: (true, pred_RF, pred_NN)
Predictions_MLPR = [] # this will contain items as [target, compound, true, prediction]

t0=time()
for target in Targets:
    with open( TRAINEDMODELS+'NN_'+target+'_'+'pIC50new.sav', 'rb') as f:
        MLPR = pickle.load( f )
    MLPR_all[target] = MLPR        
print("Models are ready. Duration so far =",time()-t0)

t0=time()
for tokens in Interactions_valid:
    # 'Target-ID', 'Compound-ID', 'Std-value'  
    if tokens[1] in Fingerprints:
        Y_true.append( tokens[2] )
        x_test = np.array( Fingerprints[tokens[1]] ).reshape(1,-1) # prepare for prediction

        model = MLPR_all[tokens[0]]
        Y_pred.append( model.predict( x_test ) )
        Predictions_MLPR.append( [tokens[0], tokens[1], tokens[2], Y_pred[-1][0]] )

    if tokens[0] in Pertarget_MLPR:
        Pertarget_MLPR[tokens[0]].append( (Y_true[-1], Y_pred[-1][0])  )
    else:
        # first time for this protein
        Pertarget_MLPR[tokens[0]] = [ (Y_true[-1], Y_pred[-1][0]) ]
        
temp = 1000*(time()-t0)/len(Predictions_MLPR)
print("Duration per 1000 predictions = ", temp )
print("Relative duration per 1000prd = ", temp/tref )

print("R2 for skNN = %f" % r2_score([x[2] for x in Predictions_MLPR], [x[3] for x in Predictions_MLPR]) )
print("MSE for skNN = %f" %     MSE([x[2] for x in Predictions_MLPR], [x[3] for x in Predictions_MLPR]) )

Models are ready. Duration so far = 0.20749902725219727
Duration per 1000 predictions =  0.7278384585650303
Relative duration per 1000prd =  29.06375630710448
R2 for skNN = 0.578135
MSE for skNN = 2.793246


In [9]:
myNN_all = {}; Y_pred = []; Y_true = []
Pertarget_myNN = dict() # contains lists with tuples: (true, pred_RF, pred_NN)
Predictions_myNN = [] # this will contain items as [target, compound, true, prediction]

t0=time()
for target in Targets:
    # load pre-trained models:
    with open( TRAINEDMODELS+'Keras/Keras_'+target+'_'+'pIC50model.json', 'r') as jsonf:
        json_model = jsonf.read()
    myNN = model_from_json(json_model)
    myNN.load_weights( TRAINEDMODELS+'Keras/Keras_'+target+'_'+'weights.h5'  )
#     myNN.compile(loss='mean_squared_error', optimizer=keras.optimizers.adam(lr=0.001))
    myNN_all[target] = myNN        
print("Models are ready. Duration so far =",time()-t0)

t0=time()
for tokens in Interactions_valid:
    # 'Target-ID', 'Compound-ID', 'Std-value'  
    if tokens[1] in Fingerprints:
        Y_true.append( tokens[2] )
        x_test = np.array( Fingerprints[tokens[1]] ).reshape(1,-1) # prepare for prediction

        model = myNN_all[tokens[0]]
        Y_pred.append( model.predict( x_test ) )
        Predictions_myNN.append( [tokens[0], tokens[1], tokens[2], Y_pred[-1][0]] )

    if tokens[0] in Pertarget_myNN:
        Pertarget_myNN[tokens[0]].append( (Y_true[-1], Y_pred[-1][0])  )
    else:
        # first time for this protein
        Pertarget_myNN[tokens[0]] = [ (Y_true[-1], Y_pred[-1][0]) ]

temp = 1000*(time()-t0)/len(Predictions_myNN)
print("Duration per 1000 predictions = ", temp )
print("Relative duration per 1000prd = ", temp/tref )

print("R2 for myNN = %f" % r2_score([x[2] for x in Predictions_myNN], [x[3] for x in Predictions_myNN]) )
print("MSE for myNN = %f" %     MSE([x[2] for x in Predictions_myNN], [x[3] for x in Predictions_myNN]) )

Models are ready. Duration so far = 85.47444200515747
Duration per 1000 predictions =  4.778290158532405
Relative duration per 1000prd =  190.8047851799705
R2 for myNN = 0.608141
MSE for myNN = 2.594569
