In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import roc_curve,roc_auc_score, confusion_matrix, precision_recall_curve, auc, mean_squared_error, \
    r2_score, mean_absolute_error,cohen_kappa_score,accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score

In [4]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def statistical(y_true, y_pred, y_pro):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : sequence of 0/1 labels (1 is the positive class).
    y_pred : sequence of hard 0/1 predictions, same length as ``y_true``.
    y_pro : sequence of scores for the positive class; used for the
        ROC and precision-recall areas.

    Returns
    -------
    collections.OrderedDict
        Keys, in order: ``acc``, ``auc_roc``, ``recall``, ``precision``,
        ``f1``, ``kappa``, ``mcc``, ``auc_prc``.  A ratio whose denominator
        is zero (e.g. precision with no predicted positives) is NaN.

    Raises
    ------
    Whatever the scikit-learn metric functions raise for invalid input
    (for example mismatched lengths) propagates unchanged.
    """
    import collections

    # Pin the label order so the matrix is always 2x2 and ravel() always
    # yields (tn, fp, fn, tp), even if a class is missing from the inputs.
    c_mat = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Plain Python ints: the four-factor MCC product below cannot wrap around
    # the way fixed-width NumPy integers do on large datasets.
    tn, fp, fn, tp = (int(count) for count in c_mat.ravel())

    recall = _safe_ratio(tp, tp + fn)          # sensitivity
    precision = _safe_ratio(tp, tp + fp)
    acc = _safe_ratio(tp + tn, tn + fp + fn + tp)

    # F1 is the harmonic mean of precision and recall.  With no true
    # positives it is 0 as long as there is at least one error (fp or fn);
    # it is undefined only when there are no positives of any kind.
    if tp > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    elif fp + fn > 0:
        f1 = 0.0
    else:
        f1 = float('nan')

    auc_roc = roc_auc_score(y_true, y_pro)

    # Compute the PR curve once; the area is integrated over recall (x)
    # against precision (y).
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_pro, pos_label=1)
    auc_prc = auc(pr_recall, pr_precision)

    kappa = cohen_kappa_score(y_true, y_pred)

    # The small epsilon keeps the denominator non-zero when a whole row or
    # column of the confusion matrix is empty (MCC is then reported as 0).
    mcc_denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + 1e-8)
    mcc = (tp * tn - fp * fn) / mcc_denominator

    return collections.OrderedDict([
        ('acc', acc),
        ('auc_roc', auc_roc),
        ('recall', recall),
        ('precision', precision),
        ('f1', f1),
        ('kappa', kappa),
        ('mcc', mcc),
        ('auc_prc', auc_prc),
    ])

In [5]:
def GetPreditcTable(preDate, tureData, name, repetitions=10):
    """Score every model's predictions and write the per-run metric tables.

    Parameters
    ----------
    preDate : pandas.DataFrame
        Prediction table.  Must contain a ``label`` column (the averaged
        score) and one ``label_model_<i>`` column for each ``i`` in
        ``range(repetitions)``.
    tureData : pandas.DataFrame
        Ground-truth table with ``smiles`` and ``label`` columns.
        NOTE(review): rows are matched to ``preDate`` purely by position --
        confirm both files are in the same order.
    name : str
        Run name; used to build the two output paths under ``Predict/``.
    repetitions : int, optional
        Number of individual models to score (default 10).

    Returns
    -------
    pandas.DataFrame
        One row of metrics per model (index 0..repetitions-1), followed by
        ``Mean`` and ``Std`` rows computed over those model rows only, and an
        ``avg`` row holding the metrics of the averaged prediction.

    Side effects
    ------------
    Writes ``Predict/<name>_Cal_Metric_Data_bak.csv`` and
    ``Predict/<name>_Cal_Metric_Summarize.csv``.
    """
    if repetitions < 1:
        raise ValueError('repetitions must be at least 1')

    smiles = tureData['smiles'].to_list()
    y_true = tureData['label'].to_list()

    # Metrics of the averaged score column, thresholded at 0.5.
    y_pro_avg = preDate['label']
    y_pred_avg = [1 if p > 0.5 else 0 for p in y_pro_avg]
    avg_scores_df = pd.DataFrame(statistical(y_true, y_pred_avg, y_pro_avg), index=['avg'])

    predict_data = []
    pre_scores = []
    for i in range(repetitions):
        y_pro = preDate['label_model_' + str(i)]
        y_pred = [1 if p > 0.5 else 0 for p in y_pro]
        predict_data.append([smiles, y_true, y_pred])
        pre_scores.append(statistical(y_true, y_pred, y_pro))

    predict_data.append([f'Average results of {repetitions} models', y_true, y_pred_avg])
    result = pd.DataFrame(predict_data, columns=['smiles', 'Test', 'Individual_Predict'])
    result.to_csv('Predict/' + name + '_Cal_Metric_Data_bak.csv', index=False)

    per_model_df = pd.DataFrame(pre_scores)
    # Compute both summary rows from the per-model rows BEFORE appending
    # either of them; otherwise the Std would also include the Mean row and
    # come out too small.
    mean_row = per_model_df.mean()
    std_row = per_model_df.std()

    data_df = per_model_df
    data_df.loc['Mean'] = mean_row
    data_df.loc['Std'] = std_row
    data_df = pd.concat([data_df, avg_scores_df], axis=0)

    # NOTE(review): index=False drops the 'Mean'/'Std'/'avg' row labels from
    # the file; kept as-is so the CSV layout stays unchanged for readers.
    data_df.to_csv('Predict/' + name + '_Cal_Metric_Summarize.csv', index=False)
    return data_df

In [6]:
# Run-name prefixes; each one selects 'Predict/<prefix>_Predict_Result.csv'.
resultFileList = [
    'AR_6108',
    'AR_6108_Rdkit2d',
    'AR_6108_Alva',
    'AR_6108_FP1024',
    'AR_6108_Mordred',
    'AR_6108_Padel',
]
# Ground-truth table; GetPreditcTable reads its 'smiles' and 'label' columns.
tureData = pd.read_csv('6_pre_32_AR_剔除NURA重复数据.csv')

In [5]:
# Score the first run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[0]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.59375,0.690196,0.666667,0.555556,0.606061,0.193798,0.197242,0.687004
1,0.625,0.745098,0.266667,0.8,0.4,0.216327,0.285653,0.77511
2,0.71875,0.8,0.466667,0.875,0.608696,0.419355,0.470016,0.789096
3,0.53125,0.6,0.333333,0.5,0.4,0.04,0.04222,0.50761
4,0.625,0.760784,0.8,0.571429,0.666667,0.264368,0.284297,0.775346
5,0.65625,0.682353,0.666667,0.625,0.645161,0.3125,0.313112,0.693872
6,0.6875,0.807843,0.866667,0.619048,0.722222,0.386973,0.416146,0.810166
7,0.65625,0.713725,0.666667,0.625,0.645161,0.3125,0.313112,0.59492
8,0.65625,0.784314,0.933333,0.583333,0.717949,0.333333,0.397706,0.718863
9,0.53125,0.65098,0.133333,0.5,0.210526,0.016393,0.023669,0.544478


In [6]:
# Score the second run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[1]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_Rdkit2d


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.5625,0.743137,0.733333,0.52381,0.611111,0.141762,0.152449,0.757516
1,0.5625,0.658824,0.533333,0.533333,0.533333,0.121569,0.121569,0.718893
2,0.6875,0.811765,1.0,0.6,0.75,0.396226,0.49705,0.786251
3,0.6875,0.796078,0.733333,0.647059,0.6875,0.377432,0.380392,0.814095
4,0.6875,0.843137,0.8,0.631579,0.705882,0.382239,0.394472,0.867909
5,0.5625,0.701961,0.733333,0.52381,0.611111,0.141762,0.152449,0.705267
6,0.65625,0.858824,0.8,0.6,0.685714,0.323077,0.33955,0.872069
7,0.5625,0.521569,0.466667,0.538462,0.5,0.114625,0.115553,0.462427
8,0.625,0.835294,0.933333,0.56,0.7,0.275472,0.345568,0.826295
9,0.75,0.807843,0.933333,0.666667,0.777778,0.509579,0.547994,0.654091


In [7]:
# Score the third run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[2]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_Alva


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.78125,0.827451,0.933333,0.7,0.8,0.569231,0.598255,0.79039
1,0.75,0.835294,0.8,0.705882,0.75,0.501946,0.505882,0.826847
2,0.625,0.643137,0.6,0.6,0.6,0.247059,0.247059,0.625338
3,0.75,0.739216,0.8,0.705882,0.75,0.501946,0.505882,0.717204
4,0.6875,0.701961,0.6,0.692308,0.642857,0.367589,0.370565,0.707484
5,0.59375,0.707843,0.666667,0.555556,0.606061,0.193798,0.197242,0.70301
6,0.6875,0.764706,0.666667,0.666667,0.666667,0.372549,0.372549,0.764008
7,0.625,0.709804,0.6,0.6,0.6,0.247059,0.247059,0.657996
8,0.71875,0.75098,0.8,0.666667,0.727273,0.44186,0.449712,0.718116
9,0.71875,0.8,0.8,0.666667,0.727273,0.44186,0.449712,0.79443


In [8]:
# Score the fourth run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[3]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_FP1024


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.53125,0.54902,0.6,0.5,0.545455,0.069767,0.071007,0.460032
1,0.625,0.796078,0.8,0.571429,0.666667,0.264368,0.284297,0.796853
2,0.5625,0.584314,0.733333,0.52381,0.611111,0.141762,0.152449,0.482344
3,0.625,0.615686,0.466667,0.636364,0.538462,0.23506,0.243095,0.541371
4,0.65625,0.7,0.8,0.6,0.685714,0.323077,0.33955,0.646379
5,0.40625,0.380392,0.333333,0.357143,0.344828,-0.19685,-0.197242,0.430934
6,0.59375,0.594118,0.8,0.545455,0.648649,0.206107,0.227988,0.562617
7,0.53125,0.631373,0.066667,0.5,0.117647,0.008264,0.016169,0.525529
8,0.46875,0.517647,0.733333,0.458333,0.564103,-0.030303,-0.036155,0.445495
9,0.5625,0.4,0.2,0.6,0.3,0.085714,0.113183,0.403246


In [9]:
# Score the fifth run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[4]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_Mordred


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.75,0.686275,0.866667,0.684211,0.764706,0.505792,0.521979,0.530172
1,0.65625,0.811765,0.8,0.6,0.685714,0.323077,0.33955,0.698655
2,0.59375,0.654902,0.466667,0.583333,0.518519,0.174603,0.177859,0.542317
3,0.71875,0.698039,0.733333,0.6875,0.709677,0.4375,0.438357,0.549393
4,0.8125,0.870588,0.933333,0.736842,0.823529,0.629344,0.649485,0.753359
5,0.625,0.65098,0.4,0.666667,0.5,0.228916,0.248096,0.549673
6,0.53125,0.596078,0.8,0.5,0.615385,0.090909,0.108465,0.492187
7,0.71875,0.866667,0.866667,0.65,0.742857,0.446154,0.468902,0.881183
8,0.75,0.823529,0.866667,0.684211,0.764706,0.505792,0.521979,0.837136
9,0.46875,0.470588,0.533333,0.444444,0.484848,-0.054264,-0.055228,0.412701


In [10]:
# Score the sixth run in resultFileList: load its predictions, compute the
# metric summary (GetPreditcTable also writes the CSVs), then display it.
name = resultFileList[5]
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_Padel


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.65625,0.72549,0.866667,0.590909,0.702703,0.328244,0.363092,0.577639
1,0.625,0.662745,0.533333,0.615385,0.571429,0.241107,0.243059,0.53376
2,0.6875,0.678431,0.866667,0.619048,0.722222,0.386973,0.416146,0.61323
3,0.6875,0.627451,0.8,0.631579,0.705882,0.382239,0.394472,0.488071
4,0.75,0.878431,0.866667,0.684211,0.764706,0.505792,0.521979,0.87936
5,0.59375,0.541176,0.8,0.545455,0.648649,0.206107,0.227988,0.449016
6,0.5625,0.631373,0.4,0.545455,0.461538,0.10757,0.111247,0.503414
7,0.5625,0.65098,0.466667,0.538462,0.5,0.114625,0.115553,0.529827
8,0.625,0.792157,0.8,0.571429,0.666667,0.264368,0.284297,0.772829
9,0.65625,0.690196,0.6,0.642857,0.62069,0.307087,0.307698,0.635302


In [8]:
# Score a run that is named directly rather than taken from resultFileList:
# load its predictions, compute the metric summary (GetPreditcTable also
# writes the CSVs), then display it.
name = 'AR_6108_MACC'
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_MACC


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.59375,0.717647,0.866667,0.541667,0.666667,0.212121,0.253086,0.714921
1,0.78125,0.839216,0.733333,0.785714,0.758621,0.559055,0.560168,0.834721
2,0.71875,0.729412,0.6,0.75,0.666667,0.428571,0.436564,0.772513
3,0.625,0.72549,0.666667,0.588235,0.625,0.252918,0.254902,0.620515
4,0.65625,0.8,0.6,0.642857,0.62069,0.307087,0.307698,0.831764
5,0.65625,0.65098,0.666667,0.625,0.645161,0.3125,0.313112,0.615823
6,0.59375,0.756863,0.733333,0.55,0.628571,0.2,0.210198,0.65578
7,0.625,0.643137,0.6,0.6,0.6,0.247059,0.247059,0.548458
8,0.625,0.717647,0.733333,0.578947,0.647059,0.258687,0.266966,0.602486
9,0.65625,0.713725,0.666667,0.625,0.645161,0.3125,0.313112,0.588785


In [9]:
# Score a run that is named directly rather than taken from resultFileList:
# load its predictions, compute the metric summary (GetPreditcTable also
# writes the CSVs), then display it.
name = 'AR_6108_PubChem'
print(name)
preDate = pd.read_csv('Predict/' + name + '_Predict_Result.csv')
resultContainsNoDescriptors = GetPreditcTable(preDate=preDate, tureData=tureData, name=name)
resultContainsNoDescriptors

AR_6108_PubChem


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.59375,0.8,0.133333,1.0,0.235294,0.140496,0.274874,0.789398
1,0.75,0.803922,0.8,0.705882,0.75,0.501946,0.505882,0.692518
2,0.65625,0.756863,0.666667,0.625,0.645161,0.3125,0.313112,0.791674
3,0.59375,0.72549,0.266667,0.666667,0.380952,0.154472,0.190525,0.588196
4,0.65625,0.827451,0.8,0.6,0.685714,0.323077,0.33955,0.85131
5,0.59375,0.729412,0.6,0.5625,0.580645,0.1875,0.187867,0.735009
6,0.6875,0.729412,0.733333,0.647059,0.6875,0.377432,0.380392,0.755159
7,0.59375,0.796078,0.2,0.75,0.315789,0.147541,0.213021,0.775923
8,0.4375,0.568627,0.2,0.333333,0.25,-0.156627,-0.16975,0.477359
9,0.65625,0.721569,0.466667,0.7,0.56,0.296,0.312428,0.598682
