In [29]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier,LGBMRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import roc_curve,roc_auc_score, confusion_matrix, precision_recall_curve, auc, mean_squared_error, \
    r2_score, mean_absolute_error,cohen_kappa_score,accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score
import pickle

def Standardize(col):
    """Centre ``col`` on its mean and scale it by its standard deviation.

    Both statistics are obtained with ``np.mean`` / ``np.std`` applied to
    ``col`` itself. No guard is applied when the standard deviation is zero.
    """
    centre = np.mean(col)
    spread = np.std(col)
    return (col - centre) / spread

def GetPreTable(trueDate,modelName,repetitions,path,DateDesc):
    """Score ``DateDesc`` with every saved repetition of a model and average the scores.

    Parameters
    ----------
    trueDate : pandas.DataFrame
        Must provide the ``'SMILES'`` and ``'label'`` columns; its rows are placed
        next to the predictions positionally, so it is expected to be row-aligned
        with ``DateDesc``.
    modelName : str
        File-name prefix of the pickled models.
    repetitions : int
        Number of models to load; files ``{modelName}_1.pkl`` through
        ``{modelName}_{repetitions}.pkl`` are read.
    path : str
        Directory that holds the pickle files.
    DateDesc : pandas.DataFrame
        Feature matrix handed unchanged to each model's ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        Columns ``smiles``, ``Test`` (the true labels), one ``label_model_<k>``
        column per model holding column 1 of ``predict_proba``, and ``label``,
        the row-wise mean of the per-model columns.
    """
    # The identifier / ground-truth columns do not depend on the model, so they
    # are built once instead of being re-assigned on every loop iteration.
    columns = {
        'smiles': trueDate['SMILES'].to_list(),
        'Test': trueDate['label'].tolist(),
    }
    for i in range(repetitions):
        modelpath = path+'/{}_{}.pkl'.format(modelName,i+1)
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that come from a trusted source.
        # The context manager closes the file handle deterministically.
        with open(modelpath, "rb") as model_file:
            model = pickle.load(model_file)
        tr_pred = model.predict_proba(DateDesc)
        columns['label_model_'+str(i+1)] = tr_pred[:, 1]

    predict_data = pd.DataFrame(columns)
    # Columns 0 and 1 are 'smiles' and 'Test'; everything after is a model score.
    predict_data['label'] = predict_data.iloc[:,2:].mean(axis=1)
    return predict_data

def Statistical(y_true, y_pred, y_pro):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions.
    y_pro : array-like
        Scores for the positive class (label 1), used for the ROC and
        precision-recall areas.

    Returns
    -------
    collections.OrderedDict
        Keys, in order: ``acc``, ``auc_roc``, ``recall``, ``precision``,
        ``f1``, ``kappa``, ``mcc``, ``auc_prc``.
    """
    import collections

    # Evaluate the precision-recall curve once; the area is integrated with
    # recall on the x-axis and precision on the y-axis.
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_pro, pos_label=1)
    auc_prc = auc(pr_recall, pr_precision)

    scores_dict = collections.OrderedDict()
    scores_dict['acc'] = accuracy_score(y_true, y_pred)
    scores_dict['auc_roc'] = roc_auc_score(y_true, y_pro)
    scores_dict['recall'] = recall_score(y_true, y_pred)
    scores_dict['precision'] = precision_score(y_true, y_pred)
    scores_dict['f1'] = f1_score(y_true, y_pred)
    scores_dict['kappa'] = cohen_kappa_score(y_true, y_pred)
    scores_dict['mcc'] = matthews_corrcoef(y_true, y_pred)
    scores_dict['auc_prc'] = auc_prc

    return scores_dict

def GetPreMetricTabel(repetitions,preDate,tureData,name,result_path,threshold=0.5):
    """Binarise the predicted probabilities, score them, and write two CSV reports.

    Parameters
    ----------
    repetitions : int
        Number of ``label_model_<k>`` columns (k = 1..repetitions) to evaluate.
    preDate : pandas.DataFrame
        Probability table providing ``label_model_<k>`` columns and the averaged
        ``label`` column.
    tureData : pandas.DataFrame
        Must provide the ``'SMILES'`` and ``'label'`` (ground truth) columns.
    name : str
        Prefix used in the output file names.
    result_path : str
        Prepended verbatim to the file names, so a directory needs its
        trailing separator.
    threshold : float, optional
        A probability strictly greater than this value is predicted as class 1.
        Defaults to 0.5.

    Returns
    -------
    pandas.DataFrame
        One row of metrics per model (integer labels), followed by ``'Mean'``
        and ``'Std'`` rows computed over the per-model rows only, and an
        ``'avg'`` row holding the metrics of the averaged probabilities.

    Side effects
    ------------
    Writes ``<result_path><name>_Cal_Metric_Data_bak.csv`` (hard predictions)
    and ``<result_path><name>_Cal_Metric_Summarize.csv`` (the returned table).
    """
    def _binarize(probabilities):
        # Hard class labels from probabilities using the configured cut-off.
        return [1 if p > threshold else 0 for p in probabilities]

    smiles = tureData['SMILES'].to_list()
    y_true = tureData['label'].to_list()

    # Metrics of the ensemble (mean probability across models).
    y_pro_avg = preDate['label']
    y_pred_avg = _binarize(y_pro_avg)
    avg_scores_dict = pd.DataFrame(Statistical(y_true, y_pred_avg, y_pro_avg),index=['avg'])

    predict_data = pd.DataFrame()
    predict_data['smiles'] = smiles
    predict_data['Test'] = y_true
    pre_scores = []
    for i in range(repetitions):
        column = 'label_model_'+str(i+1)
        y_pro = preDate[column]
        y_pred = _binarize(y_pro)
        predict_data[column] = y_pred
        pre_scores.append(Statistical(y_true, y_pred, y_pro))

    predict_data['label'] =  y_pred_avg
    predict_data.to_csv(result_path+name+'_Cal_Metric_Data_bak.csv',index = False)

    per_model = pd.DataFrame(pre_scores)
    # Compute both summary rows from the per-model rows BEFORE appending either;
    # otherwise the 'Mean' row would be counted as an extra sample in the
    # standard deviation and shrink it.
    mean_row = per_model.mean()
    std_row = per_model.std()
    data_df = per_model.copy()
    data_df.loc['Mean'] = mean_row
    data_df.loc['Std'] = std_row
    data_df = pd.concat([data_df, avg_scores_dict], axis=0)
    # NOTE(review): index=False drops the row labels (0..n-1, 'Mean', 'Std',
    # 'avg') from the file, so rows are identifiable only by position there.
    data_df.to_csv(result_path+name+'_Cal_Metric_Summarize.csv',index = False)
    return data_df

In [30]:
# Read the descriptor table once and split it by column position, rather than
# parsing the same CSV twice.
_full_table = pd.read_csv('1_AR_AlvaSlim_DMPNN_32_Normalize.csv')
# First three columns: passed as the ground-truth table; GetPreTable reads its
# 'SMILES' and 'label' columns.
trueDate = _full_table.iloc[:,:3]
# Remaining columns: the feature matrix given to each model's predict_proba.
DateDesc = _full_table.iloc[:,3:]
path = 'model'            # directory holding the pickled models
repetitions = 10          # number of saved models per algorithm
result_path = 'preresult/'  # prefix for the output CSV files (keep the trailing slash)

In [31]:
# Display the feature matrix that is passed to every model's predict_proba.
DateDesc

Unnamed: 0,MW,AMW,Sv,nAA,nTA,RBN,nDB,nTB,nN,nO,...,290,291,292,293,294,295,296,297,298,299
0,0.129193,-0.521794,0.443,1.118062,1.159369,-0.491768,-0.974725,-0.270489,-0.787971,-0.21222,...,0.0,0.203762,0.437005,0.000185,0.188031,0.174918,0.133945,0.180805,0.183998,0.003958
1,0.33654,0.925094,-0.039085,1.118062,-0.166306,-0.491768,-0.355202,-0.270489,0.473609,-0.589913,...,0.000511,0.084904,0.599218,0.0,0.199417,0.13238,0.21017,0.207585,0.205435,0.01884
2,1.999308,3.758058,0.117511,1.118062,1.159369,-0.491768,0.264322,-0.270489,-0.787971,0.543165,...,0.018513,0.209606,0.428925,0.0,0.218232,0.221193,0.063064,0.240961,0.168406,0.008145
3,-0.296136,-0.758897,0.05684,0.046967,-0.829144,0.899702,-0.974725,-0.270489,-0.787971,-0.589913,...,0.0,0.161534,0.511466,0.0,0.314601,0.122575,0.288884,0.17633,0.210839,0.007032
4,0.022861,-0.408358,0.274926,1.118062,0.165113,-0.027945,-0.355202,-0.270489,-0.787971,-0.21222,...,0.0,0.124338,0.517412,0.0,0.174603,0.11066,0.250206,0.188304,0.221731,0.010952
5,-0.056888,-0.447827,0.192939,1.118062,0.496531,-0.491768,-0.974725,-0.270489,-0.787971,-0.21222,...,0.0,0.168837,0.486069,0.0,0.168442,0.141357,0.174338,0.187201,0.204966,0.003129
6,-0.337339,0.039244,-0.292425,1.118062,-0.497725,-0.491768,-0.355202,-0.270489,-0.787971,0.165472,...,0.0,0.006576,0.649198,0.0,0.148366,0.092948,0.307393,0.257476,0.269249,0.0
7,0.089318,0.096255,0.031424,1.118062,0.496531,-0.491768,0.264322,-0.270489,-0.787971,0.543165,...,0.006973,0.124029,0.505052,0.0,0.202614,0.159425,0.172198,0.225511,0.252706,0.0
8,0.182359,0.004746,0.156865,1.118062,0.165113,-0.027945,0.264322,-0.270489,-0.787971,0.543165,...,0.006624,0.117942,0.49828,0.0,0.192875,0.121621,0.217429,0.196729,0.264837,0.030642
9,0.302647,-0.517408,0.650427,1.118062,0.165113,-0.491768,-0.974725,-0.270489,-0.787971,-0.21222,...,0.0,0.306877,0.397395,0.001631,0.15065,0.125453,0.216075,0.133207,0.149556,0.000503


In [32]:
# Evaluate the saved 'LGB' models: collect per-repetition probabilities, then
# compute and display the metric summary (CSV reports are written as a side effect).
modelName = 'LGB'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.8125,0.835294,0.8,0.8,0.8,0.623529,0.623529,0.846924
1,0.6875,0.74902,0.6,0.692308,0.642857,0.367589,0.370565,0.748795
2,0.75,0.827451,0.666667,0.769231,0.714286,0.494071,0.498071,0.828248
3,0.84375,0.85098,0.8,0.857143,0.827586,0.685039,0.686403,0.862157
4,0.78125,0.815686,0.666667,0.833333,0.740741,0.555556,0.565916,0.811835
5,0.6875,0.768627,0.666667,0.666667,0.666667,0.372549,0.372549,0.71895
6,0.8125,0.858824,0.733333,0.846154,0.785714,0.620553,0.625577,0.881905
7,0.65625,0.733333,0.6,0.642857,0.62069,0.307087,0.307698,0.750679
8,0.78125,0.784314,0.6,0.9,0.72,0.552,0.582636,0.785589
9,0.65625,0.745098,0.6,0.642857,0.62069,0.307087,0.307698,0.652116


In [33]:
# Same evaluation for the saved 'xgb' models; predictTable and metricTable are
# overwritten with this model's results.
modelName = 'xgb'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.6875,0.760784,0.6,0.692308,0.642857,0.367589,0.370565,0.726118
1,0.65625,0.74902,0.4,0.75,0.521739,0.290323,0.325396,0.738831
2,0.75,0.835294,0.6,0.818182,0.692308,0.49004,0.506791,0.69857
3,0.6875,0.847059,0.4,0.857143,0.545455,0.352227,0.411842,0.829689
4,0.78125,0.811765,0.733333,0.785714,0.758621,0.559055,0.560168,0.809086
5,0.84375,0.882353,0.8,0.857143,0.827586,0.685039,0.686403,0.832833
6,0.75,0.788235,0.6,0.818182,0.692308,0.49004,0.506791,0.793778
7,0.71875,0.8,0.666667,0.714286,0.689655,0.433071,0.433933,0.811223
8,0.71875,0.764706,0.6,0.75,0.666667,0.428571,0.436564,0.785384
9,0.59375,0.658824,0.466667,0.583333,0.518519,0.174603,0.177859,0.641171


In [34]:
# Same evaluation for the saved 'cat' models; predictTable and metricTable are
# overwritten with this model's results.
modelName = 'cat'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.65625,0.745098,0.6,0.642857,0.62069,0.307087,0.307698,0.775751
1,0.71875,0.768627,0.533333,0.8,0.64,0.424,0.447532,0.72875
2,0.59375,0.658824,0.466667,0.583333,0.518519,0.174603,0.177859,0.59456
3,0.65625,0.721569,0.533333,0.666667,0.592593,0.301587,0.307212,0.742001
4,0.65625,0.792157,0.466667,0.7,0.56,0.296,0.312428,0.751693
5,0.6875,0.737255,0.466667,0.777778,0.583333,0.35743,0.387378,0.732134
6,0.75,0.792157,0.6,0.818182,0.692308,0.49004,0.506791,0.689668
7,0.71875,0.803922,0.533333,0.8,0.64,0.424,0.447532,0.697244
8,0.625,0.690196,0.533333,0.615385,0.571429,0.241107,0.243059,0.650115
9,0.4375,0.498039,0.2,0.333333,0.25,-0.156627,-0.16975,0.497443


In [None]:
# Same evaluation for the saved 'ext' models.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it was apparently not run - confirm the 'ext' pickles exist.
modelName = 'ext'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

In [None]:
# Same evaluation for the saved 'gbc' models.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it was apparently not run - confirm the 'gbc' pickles exist.
modelName = 'gbc'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

In [None]:
# Same evaluation for the saved 'mlp' models.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it was apparently not run - confirm the 'mlp' pickles exist.
modelName = 'mlp'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

In [None]:
# Same evaluation for the saved 'rf' models.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it was apparently not run - confirm the 'rf' pickles exist.
modelName = 'rf'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

In [36]:
# Same evaluation for the saved 'svm' models; predictTable and metricTable are
# overwritten with this model's results.
modelName = 'svm'
predictTable = GetPreTable(trueDate,modelName,repetitions,path,DateDesc)
metricTable = GetPreMetricTabel(repetitions,predictTable,trueDate,modelName,result_path)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.78125,0.854902,0.933333,0.7,0.8,0.569231,0.598255,0.858332
1,0.78125,0.756863,0.8,0.75,0.774194,0.5625,0.563602,0.781964
2,0.8125,0.768627,0.8,0.8,0.8,0.623529,0.623529,0.805945
3,0.78125,0.796078,0.866667,0.722222,0.787879,0.565891,0.575947,0.826853
4,0.8125,0.8,0.866667,0.764706,0.8125,0.626459,0.631373,0.829603
5,0.8125,0.823529,0.866667,0.764706,0.8125,0.626459,0.631373,0.857567
6,0.875,0.905882,0.933333,0.823529,0.875,0.750973,0.756863,0.926166
7,0.78125,0.780392,0.8,0.75,0.774194,0.5625,0.563602,0.801561
8,0.78125,0.803922,0.8,0.75,0.774194,0.5625,0.563602,0.846362
9,0.75,0.764706,0.8,0.705882,0.75,0.501946,0.505882,0.767838
