In [1]:
import keras
import keras.backend as K
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor # this is for making a model like every other in scikit
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from time import time
import matplotlib.pyplot as plt

from keras.models import Model
from sklearn.model_selection import GridSearchCV

# import tensorflow as tf
# config = tf.ConfigProto(device_count={"CPU": 15})

Using TensorFlow backend.


### Load data

In [2]:
Interactions_train = []    
with open("Interactions_Trainset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_train.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions_valid = []        
with open("Interactions_Validset.tab",'r') as f:
    for line in f:
        tokens = line.split()
        # 'Target-ID', 'Compound-ID', 'pIC50'  
        Interactions_valid.append( [tokens[0], tokens[1], float(tokens[2]) ])

Interactions = [x for x in Interactions_train]
Interactions.extend(Interactions_valid)
# we use a dataframe to quickly sort targets wrt #compounds:
DF = pd.DataFrame( Interactions, columns =['Target-ID', 'Compound-ID','Std-value']) 
temp = DF.groupby(['Target-ID']).agg('count').sort_values(by='Compound-ID') # count the number of molecules
Targets = list(temp.index)
Compounds = np.unique(DF['Compound-ID'])
del temp, DF

nT=len(Targets); nC=len(Compounds)

print("There are {0} targets and {1} compounds currently loaded with {2} interactions.".format(nT,nC,len(Interactions)))

values = [x[2] for x in Interactions]
print("New data: {0} | {1}".format(np.mean(values), np.std(values)))

# load fingerprints and prepare as feature vectors
Fingerprints={} # this contains one list per fingerprint - not efficient...
with open('Compound_Fingerprints.tab', 'r') as f:
    header = f.readline()
    for line in f:
        # each line is Comp-ID, SMILES, FP
        tokens = line.split()
        # we keep only those compounds which have FPs
        if tokens[2] != 'NOFP':
            fp = [int(c) for c in tokens[2] ]
            Fingerprints[ tokens[0] ] = fp
print("%d fingerprints were loaded!" % len(Fingerprints))

# split to train/test data
random_seed = 2019
np.random.seed(random_seed)

print("The sizes for train and validation sets are {0} and {1} respectivelly".format( len(Interactions_train), len(Interactions_valid) ))

Labels_Targ = dict()
indx=0
for x in Targets:
    Labels_Targ[x]=indx
    indx+=1
    
Labels_Comp = dict()
indx=0
for x in Compounds:
    Labels_Comp[x]=indx
    indx+=1

There are 110 targets and 23167 compounds currently loaded with 56392 interactions.
New data: -4.604582905766776 | 2.5887050795505413
23167 fingerprints were loaded!
The sizes for train and validation sets are 45114 and 11278 respectivelly


## Dropout with MTL

In [20]:
def masked_loss_function(y_true, y_pred, MissingVal=10):
    # This function masks the elements of the vectors with true/predicted values so that the model focuses
    # only on the known data. By default, missing values are represented by 10
    mask = K.cast(K.not_equal(y_true, MissingVal), K.floatx())
    return keras.losses.mean_squared_error(y_true * mask, y_pred * mask)

def MTL_Drop( wsl, whl, drop_rate=0.1, lr=0.001):
    # a function that creates a NN with dropout incorporated after the first hidden layer
    inputs = keras.Input(shape=(2048,))
    sharedlayer = keras.layers.Dense(wsl, activation='tanh' )(inputs) 
    dropout= keras.layers.Dropout(drop_rate)(sharedlayer, training=True)
    myinit = keras.initializers.Constant(-4.)
    hidden = []
    for i in range(len(Targets)):
        hl = Dense(units=whl,  activation='tanh', kernel_regularizer=regularizers.l2(0.02) )(dropout)
        hidden.append( Dense(1, kernel_initializer=myinit, activity_regularizer=regularizers.l1(0.001) )(hl) )

    MTL=Model(inputs=inputs, outputs=hidden)
    MTL.compile(loss=masked_loss_function, optimizer=keras.optimizers.adam(lr=lr))
    return MTL

from scipy.stats import sem
from scipy.stats import t as tstat

def mulpredict(model, x_test, Ntargets, N=10, conf_flag=False):
    preds = np.zeros( (N, Ntargets) )
    for i in range(N):
        preds[i,:] = [ x[0][0] for x in model.predict( x_test ) ]
    if conf_flag:
        std_err = sem(preds, axis=0)
        h = std_err * tstat.ppf((1 + 0.95) / 2, len(preds) - 1)
        return np.mean(preds, axis=0), h
        # we need the column-wise average of this matrix
    else:
        return np.mean(preds, axis=0) 

def Evaluate(Inter_list, Comp_list, Model, Fingerprints, Ntar=110, Niter=10):
    Predictions = []
    Percomp = {} # contains dicts with lists: (target: [true, pred_NN] )
    for test_case in Comp_list:
        Percomp[ test_case ] = {}
        for tokens in Inter_list:
            if tokens[1]==test_case:
                Percomp[test_case][ tokens[0] ] = [ tokens[2] ]
        if len(Percomp[ test_case ])>0:
            # we've got some values for this compound, now produce predictions:
            preds = mulpredict(Model, np.array( Fingerprints[test_case]).reshape(1,-1), Ntar, Niter)
            
            for target in Percomp[test_case]: 
                Percomp[test_case][target].append( preds[Labels_Targ[target]])
                Predictions.append( [target,test_case, Percomp[test_case][target][0], Percomp[test_case][target][1] ])

        if len(Predictions) % 1000 == 0:
            r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
            print(f"\rMore than ", len(Predictions)," pairs have been parsed. Mean performance so far =",r2, end=" ")
    print(" ")
    r2 = r2_score([x[2] for x in Predictions], [x[3] for x in Predictions])
    print("Performance for MTL-D NN = %f" % r2)
    return Predictions

# Initialize sparse matrix - this will be binary
DTI = 10*np.ones((nC,nT),dtype=float)

for edge in Interactions_train:
    # each edge has "target-compound-value-active"
    DTI[ Labels_Comp[edge[1]], Labels_Targ[edge[0]] ] = edge[2]
DTI.shape

(23167, 110)

### Train model and evaluate without self-training

In [4]:
BS=64; Nepochs=50
param_grid={'wsl':[2000,300,200],'whl':[100,50,20], 'drop_rate':[0.05,0.1,0.2]}

Loss_history = {}
best_loss = 10
best_params =  {} #{'whl':0, 'lamda':0, 'lamda':}
with open("Cross-val-MTLdrop.txt", 'w') as f:
    for wsl in param_grid['wsl']:
        for whl in param_grid['whl']:
            for dr in param_grid['drop_rate']:
                MTLR = MTL_Drop(wsl=wsl, whl=whl, drop_rate=dr )
                MTLR.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=Nepochs, batch_size=BS, validation_split=0.25, verbose=0, use_multiprocessing=True )
                loss = np.mean( [MTLR.history.history[x][-1] for x in list(MTLR.history.history)[1:]] )
                print("Fitting for (shared, hidden, drop_rate)=({0},{1},{2}) is {3}".format(wsl, whl, dr, loss) )
                f.write("Fitting for (shared, hidden, drop_rate)=({0},{1},{2}) is {3}\n".format(wsl, whl, dr, loss))
                if loss < best_loss:
                    best_loss = loss
                    best_params['wsl'] = wsl
                    best_params['whl'] = whl
                    best_params['drop_rate'] = dr
                Loss_history[(wsl,whl,dr)] = MTLR.history.history




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Fitting for (shared, hidden, drop_rate)=(2000,100,0.05) is 0.3217461966244625
Fitting for (shared, hidden, drop_rate)=(2000,100,0.1) is 0.24812095383630348
Fitting for (shared, hidden, drop_rate)=(2000,100,0.2) is 0.2399123148197154
Fitting for (shared, hidden, drop_rate)=(2000,50,0.05) is 0.2110930841062049
Fitting for (shared, hidden, drop_rate)=(2000,50,0.1) is 0.21500517423888846
Fitting for (shared, hidden, drop_rate)=(2000,50,0.2) is 0.2202118748500931
Fitting for (shared, hidden, drop_rate)=(2000,20,0.05) is 0.182486101722071
Fitting for (shared, hidden, drop_rate)=(2000,20,0.1) is 0.19856408089302996
Fitting for (shared, hidden, drop_rate)=(2000,20,0.2) is 0.199424985808931
Fitting for (shared, hidden, drop_rate)=(300,100,0.05) is 0.5050347980912663
Fitting for (shared, hidden, drop_rate)=(300,100,0.1) is 0.35869142568025625
Fitting for (shared, hidden, drop_

In [21]:
# best_params = {'wsl': 200, 'whl': 20, 'drop_rate': 0.05}

In [6]:
# New = {}
# for combo in Loss_history.keys():
#     New[combo] = np.mean(Loss_history[combo][-1][:110])
#     print(combo,'\t', np.mean(Loss_history[combo][-1][:110]))
# # get the best parametrisation
# i=0; max_mean = {}
# for param in param_grid:
#     best=100
#     for val in param_grid[param]:
#         estim = []
#         for combo in New:
#             if combo[i]==val:
#                 estim.append( New[combo])
#         print(param,'\t',val,'\t',np.mean(estim))
#         if np.mean(estim)<best:
#             best = np.mean(estim)
#             max_mean[param] = val
#     i+=1
# print(max_mean)

KeyError: -1

In [17]:
combo = (200,20,0.05)
keys = list(Loss_history[combo])
keys.remove('val_loss')
keys.remove('loss')
print(Loss_history[combo]['val_loss'][-1])
s=0
for x in keys[:110]:
    s+=Loss_history[combo][x][-1]
print(s)
# len( [Loss_history[(2000,100,  0.05)][x] for x in range(2,110) ])

28.29037669482152
21.512064087587273


In [18]:
np.mean ([ Loss_history[combo][x][-1] for x in keys[:90] ])

0.1316437867241376

In [None]:
plt.plot( [ Loss_history[combo][x][-1] for x in keys[:110] ] )

In [None]:
plt.plot( Loss_history[combo][keys[100]] )
plt.plot( Loss_history[combo][keys[211]] )

In [None]:
aver = np.zeros((1,30))
aver_v = np.zeros((1,30))

for x in np.arange(4623,4641,2):
#     plt.plot(Loss_history[combo]['val_dense_'+str(x)+'_loss'])
    plt.plot(Loss_history[combo]['dense_'+str(x)+'_loss'])

#     aver+=Loss_history[combo]['dense_'+str(x)+'_loss']
#     aver_v+=Loss_history[combo]['val_dense_'+str(x)+'_loss']

plt.plot(aver[0], label='Train')
plt.plot(aver_v[0], label='Valid')

In [None]:
Loss_history[combo]

In [None]:
layers = list(MTLR.history.history.keys())
layers.remove("val_loss")
layers.remove("loss")

In [None]:
# max_mean={ 'wsl':200, 'whl':30, 'lamda':0.1}
max_mean = best_params
# Nepochs = 30; BS=64; lr=0.0001
wsl=max_mean['wsl']; whl=max_mean['whl']; dr=max_mean['drop_rate']
print(wsl,whl,dr,Nepochs,BS)

# we assume that CV for model selection has already been performed!
MTLD = MTL_Drop(wsl,whl,dr,0.0001)
t0=time()
MTLD.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=30, batch_size=64, verbose=0, use_multiprocessing=True )
print("Duration for fitting = ", time()-t0)
temp = Evaluate( Interactions_valid, Compounds, MTLD, Fingerprints)

200 20 0.05 50 64


In [None]:
theta=0.10
DTI_new = 10*np.ones((nC,nT),dtype=float)
for edge in Interactions_train:
    # each edge has "target-compound-value-active"
    DTI[ Labels_Comp[edge[1]], Labels_Targ[edge[0]] ] = edge[2]
DTI.shape
# print("Initial training and evaluation...")
train_size = len(Interactions_train)
count = 1 # just a trigger for the next loop
while (train_size<0.5e6) & (count>0):
# we need to stop after we have no new predictions or we have enough (10K+)
    count=0
    for x_new in Compounds:
        preds, H = mulpredict(MTLD, np.array( Fingerprints[x_new]).reshape(1,-1), 110, 20, True)
        # impute accordingly
        for t in range(110):
            if (H[t]< theta) & (DTI[Labels_Comp[x_new],t] == 10):
                DTI[Labels_Comp[x_new],t] = preds[t] # update the train set
                count+=1
        if list(Compounds).index(x_new) % 100 == 0:
            print(f"\rMore than", list(Compounds).index(x_new) ,"compounds have been parsed with ",count,"new values.", end =" ")
    print(count," new values where imputed.")
    train_size += count
    
    if count >0 :
        t0 = time()
        MTLD.fit( np.array([Fingerprints[x] for x in Compounds]), [x for x in DTI.T], epochs=30, batch_size=128, verbose=2, use_multiprocessing=True )
        print("Duration for fitting = ", time()-t0)
        temp = Evaluate( Interactions_valid, Compounds, MTLD, Fingerprints)
    # repeat if enough points were predicted confidently


In [None]:
# Architecture	: 200   20   0.1
# Training prm	: 0.0001   30   128
# Duration =  508.8630199432373
# Performance for MTL-D NN = 0.495982

# Architecture	: 2000   50   0.1
# Training prm	: 0.0001   30   128
# Training duration =  613.2853746414185
# Performance for MTL-D NN = 0.224910

# Architecture	: 300   100   0.05
# Training prm	: 0.0001   30   128
# Training duration =  459.5082895755768
# Performance for MTL-D NN = 0.413208
# MSE= 3.8852603312554295

# Architecture	: 300   100   0.05
# Training prm	: 0.0001   30   128
# Training duration =  439.01910161972046
# Performance for MTL-D NN = 0.413178
# MSE= 3.885457687501225

# Architecture	: 200   20   0.05
# Training prm	: 0.0001   30   128
# Training duration =  452.59373211860657
# Performance for MTL-D NN = 0.486700
# MSE= 3.398656832966635

