## Prerequisites

In [22]:
import os
import pickle

import numpy as np
import pandas as pd
import keras

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
# from sklearn.decomposition import TruncatedSVD as tSVD

import matplotlib.pyplot as plt
random_seed = 2019  # seed value used for the notebook's NumPy random number generation
np.random.seed(random_seed)  # seeds NumPy's global RNG only
# NOTE(review): no Keras/TensorFlow or Python `random` seed is set in the visible cells, so the
# neural-network training further down is presumably not fully reproducible - confirm.
nfolds=4  # number of CV folds; in the visible cells only the commented-out GridSearchCV calls use it
njobs =5  # number of parallel jobs; likewise only referenced by the commented-out GridSearchCV calls
# RESCALE_FACTOR = 10.0 # 0-10000 -> 0-1000 # no need for that, we use -logIC50 instead.

### Load data file from disk 

In [23]:
def load_interactions(path):
    """Read a whitespace-separated interaction file into a list of records.

    Every non-blank line must provide at least three columns, in this order:
    'Target-ID', 'Compound-ID', 'pIC50'. No header line is skipped.

    Parameters
    ----------
    path : str
        Location of the tab/space separated file.

    Returns
    -------
    list of [str, str, float]
        One `[target_id, compound_id, value]` entry per data line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three columns.
    ValueError
        If the third column cannot be parsed as a float.
    """
    records = []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            records.append([tokens[0], tokens[1], float(tokens[2])])
    return records


Interactions_train = load_interactions("Interactions_Trainset.tab")
Interactions_valid = load_interactions("Interactions_Validset.tab")

# Train records first, validation records second; the inner records are shared, not copied.
Interactions = Interactions_train + Interactions_valid
# A throw-away dataframe is the quickest way to rank the targets by their number of records
# (ascending), and to collect the distinct compound ids.
interaction_frame = pd.DataFrame(Interactions, columns=['Target-ID', 'Compound-ID', 'Std-value'])
records_per_target = (
    interaction_frame
    .groupby(['Target-ID'])
    .agg('count')
    .sort_values(by='Compound-ID')  # the 'Compound-ID' column now holds the record count
)
Targets = list(records_per_target.index)
Compounds = np.unique(interaction_frame['Compound-ID'])
del records_per_target, interaction_frame  # only needed for the ranking above

nT, nC = len(Targets), len(Compounds)

# Share of the (target x compound) grid that has a recorded value, in percent.
density = 100.0*len(Interactions)/nT/nC
print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT,nC,len(Interactions)))
print("A DTI matrix would be {0:.4}% dense!".format(density))

# Turn every fingerprint string into a feature vector: one int per character.
# Fingerprints maps Comp-ID -> list of ints (plain Python lists - not memory efficient).
Fingerprints = {}
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()  # the first line is consumed here and never parsed
    for line in f:
        # columns: Comp-ID, SMILES, FP
        tokens = line.split()
        if tokens[2] == 'NOFP':
            # compounds without a fingerprint are left out
            continue
        fp = list(map(int, tokens[2]))
        Fingerprints[tokens[0]] = fp
print("%d fingerprints were loaded!" % len(Fingerprints))

# data standardisation - no need after using pIC50 !
# Only the mean and standard deviation of the recorded values are reported.
values = [record[2] for record in Interactions]
print("Stats for values : {0} | {1}".format(np.mean(values), np.std(values)))

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
A DTI matrix would be 2.213% dense!
23167 fingerprints were loaded!
Stats for values : -4.604582905766776 | 2.5887050795505413


## MTL with Dropout

In [16]:
from scipy.stats import sem
from scipy.stats import t as tstat
from keras import Model
import keras
import keras.backend as K
from time import time

def mulpredict(model, x_test, Ntargets, N=10, conf_flag=False, confidence=0.95):
    """Average `N` repeated predictions of a multi-output model for one sample.

    `model.predict(x_test)` is called `N` times. Each call is expected to return
    one array per target (a list of `Ntargets` arrays); only element `[0][0]` of
    each array is used, i.e. the first sample's first value. A model with a
    single output that returns a plain 2-D array is accepted as well.
    The repeated calls only differ if the model is stochastic at prediction time.

    Parameters
    ----------
    model : object with a `predict(x)` method
    x_test : array-like
        Input handed to `model.predict` unchanged.
    Ntargets : int
        Number of model outputs.
    N : int, default 10
        Number of repeated predictions.
    conf_flag : bool, default False
        When True, also return the half-width of the confidence interval of the mean.
    confidence : float, default 0.95
        Two-sided confidence level used for the Student-t interval.

    Returns
    -------
    numpy.ndarray of shape (Ntargets,)
        Column-wise mean of the repeated predictions, when `conf_flag` is False.
    (numpy.ndarray, numpy.ndarray)
        `(mean, half_width)` when `conf_flag` is True; `half_width` is
        `sem * t.ppf((1 + confidence) / 2, N - 1)` and is NaN for `N < 2`.
    """
    preds = np.zeros((N, Ntargets))
    for i in range(N):
        outputs = model.predict(x_test)
        if isinstance(outputs, np.ndarray) and outputs.ndim == 2:
            # single-output models return one (samples, 1) array instead of a list of them
            outputs = [outputs]
        preds[i, :] = [head[0][0] for head in outputs]

    # column-wise average over the repeated predictions
    mean_pred = np.mean(preds, axis=0)
    if not conf_flag:
        return mean_pred

    std_err = sem(preds, axis=0)
    h = std_err * tstat.ppf((1 + confidence) / 2, N - 1)
    return mean_pred, h
    
def Evaluate(Inter_list, Comp_list, Model, Fingerprints, Ntar=110, Niter=10):
    """Score the multi-task network on a list of interactions and return the predictions.

    NOTE: a second function named `Evaluate` (for the per-target Random Forest
    models, with a different signature) is defined later in this notebook;
    whichever cell ran last wins. Inside this function the parameter `Model`
    also hides the `keras.Model` class imported at module level.

    Parameters
    ----------
    Inter_list : list of [target_id, compound_id, true_value]
    Comp_list : iterable of compound ids
        Compounds to evaluate; those without any entry in `Inter_list` are skipped.
    Model : multi-output model passed to `mulpredict`
    Fingerprints : dict
        Maps compound id -> feature vector.
    Ntar : int, default 110
        Number of model outputs (forwarded to `mulpredict`).
    Niter : int, default 10
        Number of repeated predictions per compound (forwarded to `mulpredict`).

    Returns
    -------
    list of [target_id, compound_id, true_value, predicted_value]

    Notes
    -----
    Relies on the module-level `Labels_Targ` (target id -> output index) and on
    `mulpredict`. When a (target, compound) pair occurs more than once in
    `Inter_list`, the last value is used.
    """
    # Group the true values by compound once, instead of rescanning the whole
    # interaction list for every compound.
    true_by_compound = {}
    for tokens in Inter_list:
        true_by_compound.setdefault(tokens[1], {})[tokens[0]] = tokens[2]

    Predictions = []
    reported_thousands = 0
    for test_case in Comp_list:
        true_values = true_by_compound.get(test_case)
        if not true_values:
            continue
        # one (averaged) forward pass gives the predictions for all targets of this compound
        preds = mulpredict(Model, np.array(Fingerprints[test_case]).reshape(1, -1), Ntar, Niter)
        for target, true_val in true_values.items():
            Predictions.append([target, test_case, true_val, preds[Labels_Targ[target]]])

        # progress report each time another thousand predictions has been collected;
        # never with an empty list, which r2_score rejects
        if len(Predictions) // 1000 > reported_thousands:
            reported_thousands = len(Predictions) // 1000
            r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
            print("\rMore than ", len(Predictions), " predictions have been parsed. Mean performance so far =", r2, end=" ")
    print(" ")
    if not Predictions:
        print("No predictions were produced - nothing to evaluate.")
        return Predictions
    r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
    print("Performance for MTL-D NN = %f" % r2)
    return Predictions

def masked_loss_function(y_true, y_pred):
    """Mean squared error that ignores the error of entries labelled 10.

    The value 10 is the filler the DTI matrix is initialised with, i.e. it marks
    entries without a measurement. Both tensors are multiplied by a 0/1 mask so
    those entries contribute a zero error term.
    NOTE(review): masked entries presumably still count in the averaging done by
    `mean_squared_error` - confirm whether that dilution is intended.
    """
    observed = K.cast(K.not_equal(y_true, 10), K.floatx())
    masked_true = y_true * observed
    masked_pred = y_pred * observed
    return keras.losses.mean_squared_error(masked_true, masked_pred)

def MTL_Drop( wsl, whl, drop_rate=0.1, lr=0.001):
    """Build and compile the multi-task network with an always-on dropout layer.

    Architecture: 2048 inputs -> shared tanh layer (`wsl` units) -> dropout ->
    one private tanh layer (`whl` units, L2 kernel penalty) and one linear
    output unit per entry of the module-level `Targets` list.

    Parameters
    ----------
    wsl : int
        Width of the shared layer.
    whl : int
        Width of each target-specific hidden layer.
    drop_rate : float, default 0.1
        Dropout rate applied to the shared layer.
    lr : float, default 0.001
        Learning rate of the Adam optimizer.

    Returns
    -------
    Compiled Keras model using `masked_loss_function` as loss, with
    `len(Targets)` outputs.
    """
    fingerprint_in = keras.Input(shape=(2048,))
    shared = keras.layers.Dense(wsl, activation='tanh' )(fingerprint_in)
    # the dropout layer is called with training=True, i.e. also when predicting
    shared_dropped = keras.layers.Dropout(drop_rate)(shared, training=True)
    # all output kernels start at the constant -4
    output_init = keras.initializers.Constant(-4.)

    def _target_head():
        # private hidden layer first, then the single output unit of this target
        private = Dense(units=whl,  activation='tanh', kernel_regularizer=regularizers.l2(0.02) )(shared_dropped)
        return Dense(1, kernel_initializer=output_init, activity_regularizer=regularizers.l1(0.001) )(private)

    heads = [_target_head() for _ in range(len(Targets))]

    network = Model(inputs=fingerprint_in, outputs=heads)
    network.compile(loss=masked_loss_function, optimizer=keras.optimizers.adam(lr=lr))
    return network

# Position of every target (column) and of every compound (row) in the DTI matrix.
Labels_Targ = {target: position for position, target in enumerate(Targets)}
Labels_Comp = {compound: position for position, compound in enumerate(Compounds)}

# Dense (compounds x targets) matrix of training values. Every entry starts at the
# filler value 10, which masked_loss_function treats as "no measurement"; only the
# train-set interactions overwrite it (validation pairs keep the filler).
DTI = 10*np.ones((nC,nT),dtype=float)

for edge in Interactions_train:
    # each edge is [target-id, compound-id, value]
    DTI[ Labels_Comp[edge[1]], Labels_Targ[edge[0]] ] = edge[2]
DTI.shape

(23167, 110)

In [24]:
# we assume that CV for model selection has already been performed!
N_TARGETS = len(Targets)   # one network output per target
N_MC_SAMPLES = 20          # repeated predictions per compound when imputing
MAX_TRAIN_SIZE = 0.5e6     # stop once this many matrix entries carry a value

# Feature matrix built once; its rows follow the order of `Compounds`, like the rows of DTI.
X_all = np.array([Fingerprints[x] for x in Compounds])


def _fit_and_evaluate(model):
    """Fit `model` on the current content of DTI, report the duration and score it.

    Returns the prediction list produced by `Evaluate` on the validation interactions.
    """
    t0 = time()
    model.fit( X_all, [x for x in DTI.T], epochs=30, batch_size=128, verbose=0, use_multiprocessing=True )
    print("Duration for fitting = ", time()-t0)
    return Evaluate( Interactions_valid, Compounds, model, Fingerprints, Ntar=N_TARGETS)


MTLD = MTL_Drop(2000,20,0.2,0.001)
temp = _fit_and_evaluate(MTLD)

theta=0.10   # a prediction is imputed when its confidence half-width is below theta

train_size = len(Interactions_train)
count = 1 # just a trigger for the next loop
# Self-training: repeat until a pass imputes nothing new or enough entries are filled.
while train_size < MAX_TRAIN_SIZE and count > 0:
    count=0
    for position, x_new in enumerate(Compounds):
        row = Labels_Comp[x_new]
        preds, H = mulpredict(MTLD, X_all[position].reshape(1,-1), N_TARGETS, N_MC_SAMPLES, True)
        # impute only entries that still hold the filler value 10 and are predicted confidently
        confident = (H < theta) & (DTI[row, :] == 10)
        DTI[row, confident] = preds[confident] # update the train set
        count += int(confident.sum())
        if position % 100 == 0:
            print(f"\rMore than", position ,"compounds have been parsed with ",count,"new values.", end =" ")
    print(count," new values where imputed.")
    train_size += count

    if count >0 :
        # re-train on the enlarged matrix and re-score
        temp = _fit_and_evaluate(MTLD)
    # repeat if enough points were predicted confidently

Epoch 1/30
 - 69s - loss: 225.0014 - dense_1108_loss: 0.3735 - dense_1110_loss: 0.2950 - dense_1112_loss: 0.4373 - dense_1114_loss: 0.5223 - dense_1116_loss: 0.3310 - dense_1118_loss: 0.2654 - dense_1120_loss: 0.4485 - dense_1122_loss: 0.4884 - dense_1124_loss: 0.4945 - dense_1126_loss: 0.4334 - dense_1128_loss: 0.7410 - dense_1130_loss: 0.4749 - dense_1132_loss: 0.6846 - dense_1134_loss: 0.4410 - dense_1136_loss: 0.4587 - dense_1138_loss: 0.4664 - dense_1140_loss: 0.6122 - dense_1142_loss: 0.4449 - dense_1144_loss: 0.4940 - dense_1146_loss: 0.5934 - dense_1148_loss: 0.4092 - dense_1150_loss: 0.5820 - dense_1152_loss: 0.4186 - dense_1154_loss: 0.5210 - dense_1156_loss: 0.3573 - dense_1158_loss: 0.2222 - dense_1160_loss: 0.5185 - dense_1162_loss: 0.3766 - dense_1164_loss: 0.3362 - dense_1166_loss: 0.4548 - dense_1168_loss: 0.4720 - dense_1170_loss: 0.6702 - dense_1172_loss: 0.6866 - dense_1174_loss: 0.4868 - dense_1176_loss: 0.5597 - dense_1178_loss: 0.9479 - dense_1180_loss: 0.8491 - d

Epoch 4/30
 - 18s - loss: 62.7786 - dense_1108_loss: 0.0152 - dense_1110_loss: 0.0146 - dense_1112_loss: 0.0183 - dense_1114_loss: 0.0090 - dense_1116_loss: 0.0175 - dense_1118_loss: 0.0139 - dense_1120_loss: 0.0297 - dense_1122_loss: 0.0162 - dense_1124_loss: 0.0364 - dense_1126_loss: 0.0213 - dense_1128_loss: 0.0138 - dense_1130_loss: 0.0300 - dense_1132_loss: 0.0213 - dense_1134_loss: 0.0421 - dense_1136_loss: 0.0301 - dense_1138_loss: 0.0395 - dense_1140_loss: 0.0199 - dense_1142_loss: 0.0166 - dense_1144_loss: 0.0291 - dense_1146_loss: 0.0309 - dense_1148_loss: 0.0324 - dense_1150_loss: 0.0287 - dense_1152_loss: 0.0351 - dense_1154_loss: 0.0191 - dense_1156_loss: 0.0191 - dense_1158_loss: 0.0219 - dense_1160_loss: 0.0210 - dense_1162_loss: 0.0225 - dense_1164_loss: 0.0231 - dense_1166_loss: 0.0268 - dense_1168_loss: 0.0180 - dense_1170_loss: 0.0184 - dense_1172_loss: 0.0367 - dense_1174_loss: 0.0141 - dense_1176_loss: 0.0362 - dense_1178_loss: 0.0496 - dense_1180_loss: 0.0291 - de

Epoch 7/30
 - 34s - loss: 31.7173 - dense_1108_loss: 0.0158 - dense_1110_loss: 0.0182 - dense_1112_loss: 0.0182 - dense_1114_loss: 0.0106 - dense_1116_loss: 0.0146 - dense_1118_loss: 0.0119 - dense_1120_loss: 0.0292 - dense_1122_loss: 0.0169 - dense_1124_loss: 0.0275 - dense_1126_loss: 0.0160 - dense_1128_loss: 0.0146 - dense_1130_loss: 0.0270 - dense_1132_loss: 0.0189 - dense_1134_loss: 0.0335 - dense_1136_loss: 0.0288 - dense_1138_loss: 0.0301 - dense_1140_loss: 0.0156 - dense_1142_loss: 0.0156 - dense_1144_loss: 0.0265 - dense_1146_loss: 0.0313 - dense_1148_loss: 0.0269 - dense_1150_loss: 0.0294 - dense_1152_loss: 0.0336 - dense_1154_loss: 0.0161 - dense_1156_loss: 0.0163 - dense_1158_loss: 0.0194 - dense_1160_loss: 0.0182 - dense_1162_loss: 0.0189 - dense_1164_loss: 0.0218 - dense_1166_loss: 0.0231 - dense_1168_loss: 0.0179 - dense_1170_loss: 0.0196 - dense_1172_loss: 0.0282 - dense_1174_loss: 0.0168 - dense_1176_loss: 0.0244 - dense_1178_loss: 0.0457 - dense_1180_loss: 0.0288 - de

Epoch 10/30
 - 40s - loss: 20.3169 - dense_1108_loss: 0.0145 - dense_1110_loss: 0.0109 - dense_1112_loss: 0.0191 - dense_1114_loss: 0.0084 - dense_1116_loss: 0.0148 - dense_1118_loss: 0.0105 - dense_1120_loss: 0.0271 - dense_1122_loss: 0.0179 - dense_1124_loss: 0.0304 - dense_1126_loss: 0.0189 - dense_1128_loss: 0.0156 - dense_1130_loss: 0.0242 - dense_1132_loss: 0.0168 - dense_1134_loss: 0.0341 - dense_1136_loss: 0.0262 - dense_1138_loss: 0.0271 - dense_1140_loss: 0.0163 - dense_1142_loss: 0.0201 - dense_1144_loss: 0.0263 - dense_1146_loss: 0.0267 - dense_1148_loss: 0.0289 - dense_1150_loss: 0.0223 - dense_1152_loss: 0.0335 - dense_1154_loss: 0.0131 - dense_1156_loss: 0.0174 - dense_1158_loss: 0.0212 - dense_1160_loss: 0.0184 - dense_1162_loss: 0.0151 - dense_1164_loss: 0.0215 - dense_1166_loss: 0.0229 - dense_1168_loss: 0.0166 - dense_1170_loss: 0.0233 - dense_1172_loss: 0.0262 - dense_1174_loss: 0.0160 - dense_1176_loss: 0.0324 - dense_1178_loss: 0.0362 - dense_1180_loss: 0.0292 - d

Epoch 13/30
 - 39s - loss: 15.8076 - dense_1108_loss: 0.0154 - dense_1110_loss: 0.0132 - dense_1112_loss: 0.0136 - dense_1114_loss: 0.0082 - dense_1116_loss: 0.0140 - dense_1118_loss: 0.0078 - dense_1120_loss: 0.0176 - dense_1122_loss: 0.0156 - dense_1124_loss: 0.0280 - dense_1126_loss: 0.0143 - dense_1128_loss: 0.0150 - dense_1130_loss: 0.0167 - dense_1132_loss: 0.0161 - dense_1134_loss: 0.0249 - dense_1136_loss: 0.0204 - dense_1138_loss: 0.0166 - dense_1140_loss: 0.0157 - dense_1142_loss: 0.0143 - dense_1144_loss: 0.0200 - dense_1146_loss: 0.0274 - dense_1148_loss: 0.0224 - dense_1150_loss: 0.0187 - dense_1152_loss: 0.0289 - dense_1154_loss: 0.0144 - dense_1156_loss: 0.0140 - dense_1158_loss: 0.0193 - dense_1160_loss: 0.0185 - dense_1162_loss: 0.0153 - dense_1164_loss: 0.0169 - dense_1166_loss: 0.0159 - dense_1168_loss: 0.0160 - dense_1170_loss: 0.0193 - dense_1172_loss: 0.0255 - dense_1174_loss: 0.0140 - dense_1176_loss: 0.0244 - dense_1178_loss: 0.0304 - dense_1180_loss: 0.0232 - d

Epoch 16/30
 - 39s - loss: 13.7891 - dense_1108_loss: 0.0155 - dense_1110_loss: 0.0144 - dense_1112_loss: 0.0124 - dense_1114_loss: 0.0126 - dense_1116_loss: 0.0169 - dense_1118_loss: 0.0065 - dense_1120_loss: 0.0155 - dense_1122_loss: 0.0148 - dense_1124_loss: 0.0168 - dense_1126_loss: 0.0150 - dense_1128_loss: 0.0138 - dense_1130_loss: 0.0164 - dense_1132_loss: 0.0128 - dense_1134_loss: 0.0208 - dense_1136_loss: 0.0208 - dense_1138_loss: 0.0182 - dense_1140_loss: 0.0152 - dense_1142_loss: 0.0161 - dense_1144_loss: 0.0163 - dense_1146_loss: 0.0206 - dense_1148_loss: 0.0161 - dense_1150_loss: 0.0210 - dense_1152_loss: 0.0232 - dense_1154_loss: 0.0150 - dense_1156_loss: 0.0119 - dense_1158_loss: 0.0177 - dense_1160_loss: 0.0167 - dense_1162_loss: 0.0148 - dense_1164_loss: 0.0175 - dense_1166_loss: 0.0140 - dense_1168_loss: 0.0102 - dense_1170_loss: 0.0152 - dense_1172_loss: 0.0191 - dense_1174_loss: 0.0136 - dense_1176_loss: 0.0235 - dense_1178_loss: 0.0293 - dense_1180_loss: 0.0209 - d

Epoch 19/30
 - 39s - loss: 12.6577 - dense_1108_loss: 0.0142 - dense_1110_loss: 0.0122 - dense_1112_loss: 0.0105 - dense_1114_loss: 0.0084 - dense_1116_loss: 0.0122 - dense_1118_loss: 0.0074 - dense_1120_loss: 0.0085 - dense_1122_loss: 0.0127 - dense_1124_loss: 0.0147 - dense_1126_loss: 0.0083 - dense_1128_loss: 0.0129 - dense_1130_loss: 0.0171 - dense_1132_loss: 0.0132 - dense_1134_loss: 0.0174 - dense_1136_loss: 0.0167 - dense_1138_loss: 0.0157 - dense_1140_loss: 0.0142 - dense_1142_loss: 0.0132 - dense_1144_loss: 0.0162 - dense_1146_loss: 0.0198 - dense_1148_loss: 0.0143 - dense_1150_loss: 0.0187 - dense_1152_loss: 0.0297 - dense_1154_loss: 0.0120 - dense_1156_loss: 0.0104 - dense_1158_loss: 0.0201 - dense_1160_loss: 0.0156 - dense_1162_loss: 0.0136 - dense_1164_loss: 0.0128 - dense_1166_loss: 0.0139 - dense_1168_loss: 0.0117 - dense_1170_loss: 0.0130 - dense_1172_loss: 0.0192 - dense_1174_loss: 0.0146 - dense_1176_loss: 0.0302 - dense_1178_loss: 0.0261 - dense_1180_loss: 0.0172 - d

Epoch 22/30
 - 41s - loss: 12.1470 - dense_1108_loss: 0.0140 - dense_1110_loss: 0.0144 - dense_1112_loss: 0.0089 - dense_1114_loss: 0.0110 - dense_1116_loss: 0.0131 - dense_1118_loss: 0.0081 - dense_1120_loss: 0.0123 - dense_1122_loss: 0.0164 - dense_1124_loss: 0.0142 - dense_1126_loss: 0.0078 - dense_1128_loss: 0.0120 - dense_1130_loss: 0.0151 - dense_1132_loss: 0.0132 - dense_1134_loss: 0.0136 - dense_1136_loss: 0.0164 - dense_1138_loss: 0.0117 - dense_1140_loss: 0.0108 - dense_1142_loss: 0.0117 - dense_1144_loss: 0.0154 - dense_1146_loss: 0.0146 - dense_1148_loss: 0.0128 - dense_1150_loss: 0.0357 - dense_1152_loss: 0.0217 - dense_1154_loss: 0.0161 - dense_1156_loss: 0.0098 - dense_1158_loss: 0.0206 - dense_1160_loss: 0.0131 - dense_1162_loss: 0.0143 - dense_1164_loss: 0.0136 - dense_1166_loss: 0.0144 - dense_1168_loss: 0.0091 - dense_1170_loss: 0.0134 - dense_1172_loss: 0.0162 - dense_1174_loss: 0.0151 - dense_1176_loss: 0.0275 - dense_1178_loss: 0.0265 - dense_1180_loss: 0.0188 - d

Epoch 25/30
 - 41s - loss: 11.8153 - dense_1108_loss: 0.0123 - dense_1110_loss: 0.0162 - dense_1112_loss: 0.0102 - dense_1114_loss: 0.0114 - dense_1116_loss: 0.0157 - dense_1118_loss: 0.0052 - dense_1120_loss: 0.0104 - dense_1122_loss: 0.0179 - dense_1124_loss: 0.0142 - dense_1126_loss: 0.0106 - dense_1128_loss: 0.0133 - dense_1130_loss: 0.0160 - dense_1132_loss: 0.0132 - dense_1134_loss: 0.0137 - dense_1136_loss: 0.0187 - dense_1138_loss: 0.0123 - dense_1140_loss: 0.0161 - dense_1142_loss: 0.0117 - dense_1144_loss: 0.0149 - dense_1146_loss: 0.0191 - dense_1148_loss: 0.0163 - dense_1150_loss: 0.0213 - dense_1152_loss: 0.0343 - dense_1154_loss: 0.0129 - dense_1156_loss: 0.0121 - dense_1158_loss: 0.0164 - dense_1160_loss: 0.0125 - dense_1162_loss: 0.0150 - dense_1164_loss: 0.0080 - dense_1166_loss: 0.0110 - dense_1168_loss: 0.0175 - dense_1170_loss: 0.0165 - dense_1172_loss: 0.0170 - dense_1174_loss: 0.0109 - dense_1176_loss: 0.0228 - dense_1178_loss: 0.0282 - dense_1180_loss: 0.0164 - d

Epoch 28/30
 - 41s - loss: 11.4433 - dense_1108_loss: 0.0111 - dense_1110_loss: 0.0101 - dense_1112_loss: 0.0087 - dense_1114_loss: 0.0091 - dense_1116_loss: 0.0085 - dense_1118_loss: 0.0058 - dense_1120_loss: 0.0100 - dense_1122_loss: 0.0145 - dense_1124_loss: 0.0112 - dense_1126_loss: 0.0090 - dense_1128_loss: 0.0134 - dense_1130_loss: 0.0150 - dense_1132_loss: 0.0151 - dense_1134_loss: 0.0183 - dense_1136_loss: 0.0245 - dense_1138_loss: 0.0124 - dense_1140_loss: 0.0148 - dense_1142_loss: 0.0163 - dense_1144_loss: 0.0116 - dense_1146_loss: 0.0151 - dense_1148_loss: 0.0139 - dense_1150_loss: 0.0188 - dense_1152_loss: 0.0142 - dense_1154_loss: 0.0118 - dense_1156_loss: 0.0159 - dense_1158_loss: 0.0210 - dense_1160_loss: 0.0128 - dense_1162_loss: 0.0111 - dense_1164_loss: 0.0095 - dense_1166_loss: 0.0104 - dense_1168_loss: 0.0125 - dense_1170_loss: 0.0139 - dense_1172_loss: 0.0120 - dense_1174_loss: 0.0104 - dense_1176_loss: 0.0276 - dense_1178_loss: 0.0244 - dense_1180_loss: 0.0181 - d

Duration for fitting =  1159.0961771011353
More than  11000  predictions have been parsed.  
Performance for MTL-D NN = 0.110222


## Random Forests

In [18]:
from scipy.stats import sem, t

def Imputer(model, NewID, Fingerprints, threshold=0.2, confidence=0.95, batch_size=1024):
    """Propose values for unlabelled compounds on which the ensemble members agree.

    For each compound the individual estimators in `model.estimators_` are
    queried, a two-sided Student-t confidence interval of their mean is built,
    and the mean is kept when the full interval width (`2*h`) does not exceed
    `threshold`.

    Predictions are requested in batches (one `predict` call per estimator and
    batch) rather than one call per estimator and compound; the result is the
    same up to floating-point rounding.

    Parameters
    ----------
    model : fitted ensemble exposing `estimators_`
        Each estimator must offer `predict(X)`; one value per row is assumed
        (single-output regressors).
    NewID : iterable of compound ids
    Fingerprints : dict
        Maps compound id -> feature vector.
    threshold : float, default 0.2
        Maximum accepted width of the confidence interval.
    confidence : float, default 0.95
        Two-sided confidence level of the interval.
    batch_size : int, default 1024
        Number of compounds handed to the estimators at once.

    Returns
    -------
    list of [compound_id, mean_prediction]
        Possibly empty. With fewer than two estimators the interval is NaN and
        nothing is proposed.
    """
    candidates = list(NewID)
    trees = model.estimators_
    # critical value of the t distribution; the same for every compound
    t_crit = t.ppf((1 + confidence) / 2, len(trees) - 1)

    X_new = []
    for start in range(0, len(candidates), batch_size):
        chunk = candidates[start:start + batch_size]
        features = np.array([Fingerprints[cid] for cid in chunk])
        # rows: estimators, columns: compounds of this chunk
        tree_preds = np.array([np.ravel(DTR.predict(features)) for DTR in trees])
        half_widths = sem(tree_preds, axis=0) * t_crit
        means = tree_preds.mean(axis=0)
        for cid, h, mean_pred in zip(chunk, half_widths, means):
            if 2*h <= threshold:
                X_new.append([cid, mean_pred])

    if len(X_new)>0:
        print("{0} new values were imputed!".format(len(X_new)))
    else:
        print("No confident values were found.")
    return X_new

def Evaluate( TARGET, MODEL, validationset, prnt=False ):
    """Return the R2 score of a single-target model on a validation file.

    NOTE: this definition reuses the name of the earlier multi-task `Evaluate`
    (different signature) and replaces it once this cell has been run.

    Parameters
    ----------
    TARGET : str
        Target id; only lines whose first column equals it are scored.
    MODEL : object with a `predict(x)` method
    validationset : str
        Path of a header-less, whitespace-separated file with the columns
        target id, compound id, value.
    prnt : bool, default False
        Print the number of scored points and the score.

    Returns
    -------
    The value returned by `r2_score` for the selected lines.

    Feature vectors are looked up in the module-level `Fingerprints` dict.
    """
    observed, predicted = [], []
    with open( validationset, 'r') as handle:
        # the file has no header line
        for record in handle:
            fields = record.split()
            if fields[0] != TARGET:
                continue
            observed.append( float(fields[2]) )
            features = np.array( Fingerprints[fields[1]] ).reshape(1,-1)
            predicted.append( MODEL.predict( features ) )
    score = r2_score(observed, predicted)
    if prnt:
        print("R2-score after {0} points = {1:.4f} ".format(len(observed), score ) )
    return score

In [20]:
Target_info = {} # this is a "global" variable: per-target sizes, scores and the final model
theta=0.1                 # maximum confidence-interval width accepted by Imputer
MAX_TRAIN_POINTS = 2000   # no (further) imputation once a target's train set reaches this size
# param_grid={ 'max_depth':[4,5,7,10,12], 'n_estimators':[25,50,100,150],} # 'max_features':['sqrt','auto']


def _validation_r2(model, target):
    """R2 of `model` on the validation interactions of `target`; NaN when there are none."""
    y_true = []
    x_valid = []
    for point in Interactions_valid:
        if point[0]==target:
            y_true.append( float(point[2]) )
            x_valid.append( Fingerprints[point[1]] )
    if not y_true:
        return float('nan')
    # one batched predict call for all validation compounds of this target
    return r2_score(y_true, model.predict( np.array(x_valid) ))


for count, target in enumerate(Targets):
    Target_info[target] = {}

    # define the train set
    X_train=[]; Y_train=[]
    Train_CIDs = []
    for point in Interactions_train:
        if point[0]==target:
            X_train.append( Fingerprints[point[1]] )
            Y_train.append( float(point[2]) )
            Train_CIDs.append( point[1] )
    Target_info[target]['train_size']=len(Y_train) # add info

    # The pickled estimator is re-fitted right below, so it only supplies the model
    # configuration. NOTE(review): an earlier revision of this cell selected max_depth /
    # n_estimators with GridSearchCV and pickled the result; presumably these files come
    # from a similar procedure - confirm.
    # SECURITY: pickle.load can execute arbitrary code - only load files from a trusted source.
    with open( 'ECFP/TrainedModels/RF_'+target+'_'+'pIC50new.sav', 'rb') as f:
        MODEL = pickle.load( f )

    MODEL.fit(X_train,Y_train)
    Target_info[target]['RF_train_r2'] = MODEL.score( X_train,  Y_train) # add info

    Target_info[target]["first_r2"] = _validation_r2(MODEL, target)
    print("Evaluation without imputation = %.4f " % Target_info[target]["first_r2"] )

    # Self-training: keep imputing confident predictions until none are found
    # or the train set is large enough.
    known_cids = set(Train_CIDs)   # set gives O(1) membership tests
    while len(Train_CIDs) < MAX_TRAIN_POINTS:
        NewIDs = [x for x in Compounds if x not in known_cids] # terra incognita
        X_new = Imputer(MODEL, NewIDs, Fingerprints, threshold=theta)
        if not X_new:
            # nothing was added, so re-training on identical data is skipped
            break

        # update the train set
        for point in X_new:
            X_train.append( Fingerprints[point[0]] )
            Y_train.append( float(point[1]) )
            Train_CIDs.append( point[0] )
            known_cids.add( point[0] )
        # re-train
        MODEL.fit(np.array( X_train ),Y_train)

    Target_info[target]["model"] = MODEL
    # evaluate again as before
    Target_info[target]["after_r2"] = _validation_r2(MODEL, target)
    print("Re-evaluate after imputation: %.4f " % Target_info[target]["after_r2"] )

    if count%25==0:
        print("More than %d targets are processed" % count)

Evaluation without imputation = 0.4265 
No confident values were found.
Re-evaluate after imputation: 0.4265 
More than 0 targets are processed
Evaluation without imputation = 0.3326 
No confident values were found.
Re-evaluate after imputation: 0.3326 
Evaluation without imputation = 0.4879 
No confident values were found.
Re-evaluate after imputation: 0.4879 
Evaluation without imputation = 0.6611 
No confident values were found.
Re-evaluate after imputation: 0.6611 
Evaluation without imputation = 0.0597 
No confident values were found.
Re-evaluate after imputation: 0.0597 
Evaluation without imputation = 0.9904 
No confident values were found.
Re-evaluate after imputation: 0.9904 
Evaluation without imputation = 0.3546 


KeyboardInterrupt: 

In [19]:
# Validation R2 before vs. after imputation, one dot per target.
# Requires the Random Forest cell to have been run first (it fills Target_info).
FIRST_PLOTTED = 40  # the first 40 entries of Targets are left out of the plot

# Only targets with both scores are plotted, so an interrupted run can still be inspected.
scored_targets = [
    tid for tid in Targets[FIRST_PLOTTED:]
    if 'first_r2' in Target_info.get(tid, {}) and 'after_r2' in Target_info.get(tid, {})
]
T1 = [Target_info[tid]['first_r2'] for tid in scored_targets]
T2 = [Target_info[tid]['after_r2'] for tid in scored_targets]

fig, ax = plt.subplots()
ax.plot(T1, T2, '.')
ax.plot([-0.2,1],[-0.2,1])  # diagonal: same score before and after
ax.set(
    xlabel="R2 without imputation",
    ylabel="R2 after imputation",
    title="Random Forest validation R2 per target ({0} targets)".format(len(scored_targets)),
);

NameError: name 'Target_info' is not defined