In [None]:
import tensorflow as tf
import os


import sys
import keras


from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, LSTM, Bidirectional,Multiply 
from keras.layers import BatchNormalization, merge, add
from keras.layers.core import Flatten, Reshape
from keras.layers.merge import Concatenate, concatenate, subtract, multiply
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, AveragePooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.optimizers import Adam,  RMSprop

import keras.backend.tensorflow_backend as KTF

import numpy as np
from tqdm import tqdm
from keras.layers import Input, CuDNNGRU, GRU
from numpy import linalg as LA
import scipy
from keras import backend as K
import re
from sklearn.model_selection import train_test_split
# Restrict CUDA to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Grow GPU memory on demand instead of reserving all of it up front.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)

# Register this session with the Keras TensorFlow backend.
KTF.set_session(sess)

In [None]:
from keras.layers import Lambda,Add, CuDNNGRU,TimeDistributed, Bidirectional,Softmax
from keras import regularizers
from keras.regularizers import l2
import tensorflow as tf
from keras import regularizers

# First (sequence) axis of the model input; each embedding is truncated or
# zero-padded to this many rows when the dataset array is built.
smilen = 256
# Second (feature) axis of the model input and of the padded dataset array.
hidden_dim = 256


def se_block(input, channels, r=8):
    """Re-weight the channels of a 1D feature map with a squeeze-and-excitation gate.

    The feature map is averaged over its sequence axis, passed through a
    bottleneck of ``channels // r`` ReLU units and expanded back to
    ``channels`` sigmoid gates, which are multiplied into ``input``.

    Args:
        input: Keras tensor produced by a 1D layer (pooled with
            ``GlobalAveragePooling1D``).
        channels: Number of gates to produce; must match the channel count
            of ``input`` for the final multiplication.
        r: Reduction ratio of the bottleneck layer.

    Returns:
        The gated Keras tensor produced by ``Multiply``.
    """
    bottleneck_units = channels // r
    pooled = GlobalAveragePooling1D()(input)
    squeezed = Dense(bottleneck_units, activation="relu")(pooled)
    gates = Dense(channels, activation="sigmoid")(squeezed)
    return Multiply()([input, gates])



def conv_block(inputs, seblock, NUM_FILTERS,FILTER_LENGTH1):
    """Apply one ReLU ``Conv1D`` layer, optionally followed by ``se_block``.

    Args:
        inputs: Keras tensor fed to the convolution.
        seblock: When truthy, the convolution output is gated by ``se_block``.
        NUM_FILTERS: Number of convolution filters (also the channel count
            handed to ``se_block``).
        FILTER_LENGTH1: Kernel size of the convolution.

    Returns:
        The convolved (and optionally gated) Keras tensor.
    """
    conv_layer = Conv1D(
        filters=NUM_FILTERS,
        kernel_size=FILTER_LENGTH1,
        activation='relu',
        padding='valid',
        strides=1,
    )
    features = conv_layer(inputs)
    if not seblock:
        return features
    return se_block(features, NUM_FILTERS)
 
    
def build_model():
    """Build and compile the regression network.

    Architecture: input of shape ``(smilen, hidden_dim)`` -> ``conv_block``
    (Conv1D with SE gating) -> global max pooling -> Dense(256, ReLU, L2)
    -> Dense(1) with no activation.

    Returns:
        A compiled Keras ``Model`` using the 'adam' optimizer and MSE as
        both loss and reported metric.
    """
    # Encoder hyperparameters.
    seblock = True
    NUM_FILTERS = 2048
    FILTER_LENGTH1 = 1
    n_layers = 4  # not referenced anywhere in this function

    drug_input = Input(shape=(smilen, hidden_dim))

    conv_features = conv_block(drug_input, seblock, NUM_FILTERS, FILTER_LENGTH1)
    pooled_features = GlobalMaxPooling1D()(conv_features)

    hidden = Dense(
        256,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.01),
    )(pooled_features)

    # No activation on the output layer: the prediction is an unbounded linear value.
    predictions = Dense(1, kernel_initializer='normal')(hidden)

    interaction_model = Model(inputs=[drug_input], outputs=[predictions])
    # Default 'adam' settings are used; an explicit Adam(lr=0.0001) is not enabled.
    interaction_model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    return interaction_model


# Build one model up front to inspect the architecture.
model = build_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
import numpy as np
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only load these files from a trusted source.
embX = np.load('../../../encoded-dataset/regression/ESOL_embX_fullBERT.npy', allow_pickle=True)
newY = np.load('../../../encoded-dataset/regression/ESOL_Y_fullBERT.npy', allow_pickle=True)

# Fixed-size input array: one (smilen, hidden_dim) slot per sample, zero-initialised.
X = np.zeros((len(embX), smilen, hidden_dim))
print(X.shape)

# Copy each embedding into its slot, truncating anything longer than smilen rows
# and leaving the zero padding in place for shorter ones.
for idx, emb in enumerate(embX):
    n_rows = min(len(emb), smilen)
    X[idx, :n_rows] = emb[:n_rows]

print(newY.shape)

In [None]:
# Run the train/evaluate experiment 5 times
from keras.callbacks import ModelCheckpoint, EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, roc_auc_score
from keras.callbacks import ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
# Per-run results collected for the summary and plotting cells.
rmses = []
# `stds` stays defined but empty: the only producer of a per-run `std` (the
# disabled get_dataset() call) is commented out, and appending that undefined
# name raised NameError at the end of the first run.
stds = []
test_y_all = []
pred_y_all = []

for i in range(5):
    save_model_name = f'models-regression-ESOL-TT8_run_{i+1}_full4'

    # Fresh, untrained model for every run.
    model = build_model()

    # Keep only the weights with the lowest validation loss.
    save_checkpoint = ModelCheckpoint(save_model_name, verbose=1,
                                      save_best_only=True,
                                      monitor='val_loss',
                                      save_weights_only=True, mode='min')
    earlyStopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1, mode='min')
    lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=50, mode='min', verbose=1)

    # Two-stage split with a run-specific seed: hold out 10% for testing, then
    # 0.1111 of the remainder (about 10% of the full data) for validation.
    train_x, test_x, train_y, test_y = train_test_split(X, newY, test_size=0.1, random_state=42+i)
    train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.1111, random_state=42+i)

    model.fit(x=train_x, y=train_y, batch_size=256, epochs=1000, verbose=1,
              validation_data=(valid_x, valid_y),
              callbacks=[earlyStopping, save_checkpoint, lr_reduce])

    # Evaluate the best checkpoint, not the weights from the final epoch.
    model.load_weights(save_model_name)
    y_pred = model.predict([test_x])

    # RMSE as sqrt(MSE): same value as mean_squared_error(..., squared=False)
    # for this single-output model, without depending on the `squared` keyword.
    rmse = np.sqrt(mean_squared_error(test_y, y_pred))
    print(rmse)
    rmses.append(rmse)

    test_y_all.append(test_y)
    pred_y_all.append(y_pred.reshape(-1))

In [None]:
# Display the per-run test RMSEs, then their mean and standard deviation across runs.
rmses,np.mean(rmses),np.std(rmses)

In [None]:
# Plot measured vs. predicted values for each run
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# One measured-vs-predicted joint plot per run, saved as ESOL_<run>_full4.png.
for i in range(5):
    true = test_y_all[i].reshape(-1)
    pred = pred_y_all[i].reshape(-1)
    df = pd.DataFrame({
        'Measured': true,
        'Predicted': pred })

    plt.rcParams.update({'font.size': 10})
    # sns.jointplot opens its own figure, so no plt.figure() call precedes it:
    # the previous plt.figure(figsize=(15,15)) only produced an extra empty
    # figure per run. Pass height=... to jointplot to change the plot size.
    gfg = sns.jointplot(x="Predicted",
                        y="Measured",
                        data=df,
                        color="k", xlim=(-25, 10), ylim=(-25, 10),
                        marginal_kws=dict(bins=50, color='b'))

    # The joint plot is the current figure, so this saves it.
    plt.savefig(f"ESOL_{i+1}_full4.png")
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(gfg.fig)

In [None]:
from sklearn.metrics import r2_score
from scipy.stats import pearsonr 
from scipy.stats import ttest_rel
# Per-run report: train R2, test R2 (q2), Pearson r, paired t-test p-value,
# followed by the Pearson p-value on its own line.
for i in range(5):
    save_model_name = f'models-regression-ESOL-TT8_run_{i+1}_full4'
    model = build_model()
    model.load_weights(save_model_name)

    # Reproduce BOTH splits of the training loop (same sizes and seeds).
    # With only the first split, train_x still contained the validation
    # samples, so the "train" R2 was not computed on the data the model was
    # actually fitted on.
    train_x, test_x, train_y, test_y = train_test_split(X, newY, test_size=0.1, random_state=42+i)
    train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.1111, random_state=42+i)

    true = test_y_all[i].reshape(-1)
    pred = pred_y_all[i].reshape(-1)
    y_pred_train = model.predict([train_x])

    r2 = r2_score(train_y, y_pred_train.reshape(-1))
    q2 = r2_score(true, pred)
    # Index 0 is the correlation coefficient, index 1 its p-value.
    pearson_stats = pearsonr(true, pred)
    pearson_cc = pearson_stats[0]
    p_value = ttest_rel(true, pred).pvalue

    print(r2,',',q2, ',',pearson_cc, ',',p_value)
    print(pearson_stats[1])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# One residual plot (predicted - measured vs. predicted) per run.
for i in range(5):
    true = test_y_all[i].reshape(-1)
    pred = pred_y_all[i].reshape(-1)

    # Explicit figure per run so each plot is saved and closed independently.
    fig, ax = plt.subplots()
    ax.scatter(pred, pred - true, c="green", marker=".")
    # Zero-residual reference line.
    ax.hlines(y=0, xmin=-25, xmax=10, lw=2, color="red")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Residual")

    # Tagged "full4" to match the model checkpoints and joint plots of this
    # experiment (the previous "full3" tag mislabelled the output files).
    fig.savefig(f"ESOL_{i+1}_full4_res.png")
    plt.show()
    plt.close(fig)