In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd, numpy as np

In [6]:
# Feature matrix; its default integer index is what the later cells use to
# address rows (assumes the row order matches the label file -- TODO confirm).
X = pd.read_csv("data/trainMatrix.csv")
# Labels as a Series. Squeeze only the column axis so that a single-row file
# still yields a Series instead of collapsing to a scalar.
Y = pd.read_csv("data/curedData.csv", usecols=["label"]).squeeze("columns")

## Stratified train/test splitting: 80% and 20% of the total chemicals

In [7]:
# Number of train/test partitions iterated over by the cells below.
nSplit = 100

## Cross-validation on the 80% partition and predictions on the held-out 20%

In [8]:
# Partition table: one column per split, holding "train"/"test" labels.
# read_csv yields string column labels, so convert them to integers to allow
# lookup by split number (dataSplit[cv]).
dataSplit = (
    pd.read_csv("data/splitData.csv", index_col=0)
      .rename(columns=int)
)

In [10]:
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
import random

from consensusModel import clsf

In [None]:
# Score table: one row per row of X, one column per (split, classifier) pair.
# For each split, rows in the "test" partition receive the score of a model
# fitted on the whole "train" partition, while rows in the "train" partition
# receive out-of-fold scores from a stratified k-fold CV on that partition.
allScores = pd.DataFrame(
    columns=pd.MultiIndex.from_product((range(nSplit), clsf.keys())),
    index=range(len(X)),
)
folds = 5

for cv in tqdm(range(nSplit)):
    # Kept from the original: seeds Python's global RNG for this split.
    random.seed(cv)
    # Seed the fold shuffling with the split number. The original passed
    # random_state=random.seed(cv), i.e. None (random.seed returns None), which
    # left the shuffling unseeded and the CV scores non-reproducible.
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=cv)

    thisSplit = dataSplit[cv]
    # Positional indices of each partition; they are used as labels on X,
    # which relies on X carrying the default 0..n-1 index.
    partitionIdx = thisSplit.groupby(thisSplit).indices
    Xsplit = X.loc[partitionIdx["train"]]
    testSplit = X.loc[partitionIdx["test"]]
    Ysplit = Y.loc[Xsplit.index]

    for cName, cObj in clsf.items():
        # fitting on the "train" partition
        cObj.fit(Xsplit.values, Ysplit.values)
        # positive-class score (column 1 of predict_proba) on the "test" partition
        prob = cObj.predict_proba(testSplit.values)[:, 1]

        allScores.loc[testSplit.index, (cv, cName)] = prob

    # Out-of-fold scores for the "train" partition; the classifier objects
    # are refitted in place on every fold.
    for train, val in skf.split(Xsplit, Ysplit):
        # skf.split returns positions within Xsplit; map them back to X's labels
        idxTrain = Xsplit.iloc[train].index
        idxVal = Xsplit.iloc[val].index

        Ytrain = Y[idxTrain]

        Xtrain = X.loc[idxTrain].values
        Xval = X.loc[idxVal].values

        for cName, cObj in clsf.items():
            cObj.fit(Xtrain, Ytrain)
            prob = cObj.predict_proba(Xval)[:, 1]

            allScores.loc[idxVal, (cv, cName)] = prob

 49%|████████████████████████████████████████▏                                         | 49/100 [04:47<05:00,  5.90s/it]

## Results

In [15]:
# Reload the stored scores; the two header rows form the (split, classifier)
# column MultiIndex and the first column is the row index.
allScores = pd.read_csv("results/CVcutoffScores.csv", header=[0,1], index_col=0)
# read_csv parses the split numbers as strings: cast level 0 to int so that
# allScores[n] works with integer n. Only level 0 is replaced -- the original
# also overwrote level 1 (the classifier names) with the level-0 values.
allScores.columns = allScores.columns.set_levels(
    allScores.columns.levels[0].astype(int), level=0
)

In [13]:
# Display the score table (columns are (split, classifier) pairs).
allScores

Unnamed: 0_level_0,0,0,0,0,0,1,1,1,1,1,...,98,98,98,98,98,99,99,99,99,99
Unnamed: 0_level_1,RF,SVM,XGB,KNN,ADA,RF,SVM,XGB,KNN,ADA,...,RF,SVM,XGB,KNN,ADA,RF,SVM,XGB,KNN,ADA
molID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.910000,0.855687,0.915959,1.0,0.511818,0.876667,0.883905,0.959654,0.8,0.511754,...,0.936667,0.860430,0.937577,1.0,0.510955,0.930000,0.912035,0.973442,0.8,0.513061
1,0.980000,0.835905,0.893418,0.6,0.677356,0.950000,0.839239,0.913346,0.8,0.508789,...,0.940000,0.711503,0.824940,0.8,0.503309,1.000000,0.855524,0.954617,1.0,0.515953
2,0.960000,0.876072,0.938534,1.0,0.683530,0.960000,0.854062,0.849119,1.0,0.508789,...,0.707333,0.894285,0.951358,1.0,0.511358,0.929476,0.918388,0.849749,1.0,0.509565
3,0.960000,0.876072,0.938534,1.0,0.683530,0.852500,0.843251,0.836675,1.0,0.507966,...,0.907500,0.925880,0.917339,1.0,0.510955,0.970000,0.944325,0.944313,1.0,0.513068
4,0.535381,0.832700,0.504662,0.8,0.507710,0.603333,0.697057,0.563109,0.6,0.501805,...,0.940000,0.711503,0.824940,0.8,0.503309,0.270000,0.780102,0.356402,0.6,0.504691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,0.940000,0.884692,0.929047,1.0,0.512133,0.780000,0.851080,0.921666,1.0,0.507396,...,0.845000,0.873185,0.957845,1.0,0.510955,0.902667,0.878159,0.807462,1.0,0.508182
729,0.960000,0.924649,0.914517,1.0,0.511818,0.900000,0.849802,0.818534,1.0,0.508789,...,1.000000,0.868934,0.939386,1.0,0.507878,0.986667,0.919477,0.931663,0.8,0.513068
730,0.860000,0.766468,0.383223,1.0,0.498308,0.820000,0.744961,0.699710,0.8,0.496251,...,0.870000,0.745298,0.759205,1.0,0.501215,0.823333,0.654883,0.511420,1.0,0.498709
731,0.710000,0.663452,0.234558,1.0,0.497210,0.600000,0.622217,0.316278,0.6,0.496258,...,0.850000,0.761151,0.467750,1.0,0.496827,0.790000,0.743218,0.383273,0.8,0.499824


In [16]:
# Consensus score per split: unweighted mean over the classifier columns of
# that split. The result has one column per split, labelled 0..nSplit-1.
# `keys` supplies those column labels; the original passed them as `names`,
# which only names index levels and does not label the columns.
consensusScores = pd.concat(
    [allScores[n].mean(axis=1) for n in range(nSplit)],
    axis=1,
    keys=list(range(nSplit)),
)

In [32]:
from utilsMetric import metrics, getTableFromStats

In [19]:
# Candidate cutoffs 0.3, 0.4, ..., 0.8. np.arange accumulates float error
# (e.g. 0.6000000000000001), which would pollute the column labels and make
# `score >= cutoff` miss scores equal to the nominal cutoff, so round to 1 dp.
steps = np.round(np.arange(0.3, 0.81, 0.1), 1)

metricsSplitCV = pd.DataFrame(
    columns=pd.MultiIndex.from_product((range(nSplit), steps)),
    index=metrics.keys(),
)

# One boolean row mask per split selecting its "train" partition
# (loop-invariant, so computed once).
trainMasks = (dataSplit == "train").values.T

# Storing performance from CV prediction on each cutoff classification score.
# Each metric is called as metrFun(true labels, binarised predictions, raw scores).
# The comprehension iterates splits in the outer loop and cutoffs in the inner
# loop, matching the (split, cutoff) order of the column MultiIndex.
for metrName, metrFun in metrics.items():
    metricsSplitCV.loc[metrName] = [
        metrFun(Y[mask], pd.Series(scores)[mask].ge(cutoff).astype(int), scores[mask])
        for mask, scores in zip(trainMasks, consensusScores.values.T)
        for cutoff in steps
    ]

In [20]:
# Display the CV metrics: rows are metrics, columns are (split, cutoff) pairs.
metricsSplitCV

Unnamed: 0_level_0,0,0,0,0,0,0,1,1,1,1,...,98,98,98,98,99,99,99,99,99,99
Unnamed: 0_level_1,0.3,0.4,0.5,0.6,0.7,0.8,0.3,0.4,0.5,0.6,...,0.5,0.6,0.7,0.8,0.3,0.4,0.5,0.6,0.7,0.8
spec,0.151515,0.20202,0.232323,0.30303,0.40404,0.565657,0.111111,0.222222,0.242424,0.313131,...,0.272727,0.353535,0.444444,0.656566,0.191919,0.252525,0.282828,0.353535,0.434343,0.626263
sens,1.0,1.0,0.98768,0.956879,0.89117,0.685832,0.997947,0.995893,0.975359,0.940452,...,0.973306,0.940452,0.856263,0.718686,1.0,0.995893,0.983573,0.956879,0.907598,0.718686
balacc,0.575758,0.60101,0.610001,0.629955,0.647605,0.625744,0.554529,0.609058,0.608892,0.626792,...,0.623017,0.646994,0.650354,0.687626,0.59596,0.624209,0.633201,0.655207,0.67097,0.672474
mcc,0.35948,0.416921,0.380088,0.345495,0.302726,0.196723,0.288531,0.412377,0.33984,0.313425,...,0.365552,0.353171,0.284644,0.295844,0.406006,0.443995,0.415705,0.395632,0.358655,0.272932
ppv,0.85289,0.860424,0.863555,0.871028,0.880325,0.885942,0.84669,0.862989,0.863636,0.870722,...,0.868132,0.877395,0.883475,0.911458,0.858907,0.867621,0.870909,0.879245,0.88755,0.904393
npv,1.0,1.0,0.793103,0.588235,0.430108,0.267943,0.916667,0.916667,0.666667,0.516667,...,0.675,0.546875,0.385965,0.321782,1.0,0.925926,0.777778,0.625,0.488636,0.311558
PRcurve,0.893144,0.893144,0.893144,0.893144,0.893144,0.893144,0.925259,0.925259,0.925259,0.925259,...,0.920316,0.920316,0.920316,0.920316,0.909442,0.909442,0.909442,0.909442,0.909442,0.909442
ROCcurve,0.697115,0.697115,0.697115,0.697115,0.697115,0.697115,0.741688,0.741688,0.741688,0.741688,...,0.744934,0.744934,0.744934,0.744934,0.754423,0.754423,0.754423,0.754423,0.754423,0.754423


In [21]:
# Retrieving cutoff value maximizing mcc for each run of CV
def _bestCutoffForSplit(split):
    """Return the cutoff with the highest CV "mcc" value for the given split."""
    mccByCutoff = metricsSplitCV.loc["mcc", split].astype(float)
    return mccByCutoff.idxmax()


# One selected cutoff per CV run, in split order.
mccMax = pd.Series([_bestCutoffForSplit(split) for split in range(nSplit)])

In [27]:
# Summary statistics of the cutoffs selected across the splits.
mccMax.describe()

count    100.000000
mean       0.460000
std        0.093203
min        0.300000
25%        0.400000
50%        0.500000
75%        0.500000
max        0.700000
dtype: float64

In [23]:
metricsSplitTest = pd.DataFrame(index=metrics.keys(), columns=range(nSplit))

# Evaluate every metric on the "test" partition of each split, binarising the
# consensus score with the cutoff selected for that same split (mccMax).
for metrName, metrFun in metrics.items():
    perSplitValues = []
    for mask, cutoff, scores in zip((dataSplit == "test").values.T,
                                    mccMax,
                                    consensusScores.values.T):
        predicted = pd.Series(scores)[mask].apply(lambda x: 1 if x >= cutoff else 0)
        perSplitValues.append(metrFun(Y[mask], predicted, scores[mask]))
    metricsSplitTest.loc[metrName] = perSplitValues

In [30]:
# Display the test-partition metrics: rows are metrics, columns are splits.
metricsSplitTest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
spec,0.28,0.28,0.28,0.24,0.12,0.16,0.24,0.32,0.36,0.32,...,0.16,0.2,0.24,0.2,0.28,0.28,0.16,0.16,0.28,0.16
sens,0.991803,0.991803,0.983607,0.991803,0.967213,1.0,0.97541,0.97541,0.983607,0.967213,...,1.0,0.983607,0.991803,1.0,0.991803,0.991803,1.0,0.991803,0.983607,0.991803
balacc,0.635902,0.635902,0.631803,0.615902,0.543607,0.58,0.607705,0.647705,0.671803,0.643607,...,0.58,0.591803,0.615902,0.6,0.635902,0.635902,0.58,0.575902,0.631803,0.575902
mcc,0.450145,0.450145,0.413091,0.408937,0.153858,0.369464,0.337563,0.421802,0.49062,0.394091,...,0.369464,0.323911,0.408937,0.414525,0.450145,0.450145,0.369464,0.314631,0.413091,0.314631
ppv,0.870504,0.870504,0.869565,0.864286,0.842857,0.853147,0.862319,0.875,0.882353,0.874074,...,0.853147,0.857143,0.864286,0.859155,0.870504,0.870504,0.853147,0.852113,0.869565,0.852113
npv,0.875,0.875,0.777778,0.857143,0.428571,1.0,0.666667,0.727273,0.818182,0.666667,...,1.0,0.714286,0.857143,1.0,0.875,0.875,1.0,0.8,0.777778,0.8
PRcurve,0.942818,0.950367,0.880025,0.952207,0.933048,0.88845,0.901696,0.930858,0.893673,0.914087,...,0.926329,0.90091,0.943325,0.92262,0.94907,0.913047,0.895911,0.915904,0.866367,0.859639
ROCcurve,0.802131,0.803279,0.710164,0.803115,0.742459,0.73082,0.707541,0.773443,0.731148,0.736721,...,0.751475,0.672459,0.812787,0.733443,0.809836,0.775082,0.681803,0.733279,0.648361,0.606885


In [31]:
# Distribution of each test metric across the splits (one column per metric).
metricsSplitTest.T.astype(float).describe()

Unnamed: 0,spec,sens,balacc,mcc,ppv,npv,PRcurve,ROCcurve
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.2604,0.980984,0.620692,0.390705,0.866513,0.7982,0.913025,0.737302
std,0.096734,0.024127,0.04424,0.099346,0.014245,0.182855,0.024335,0.054704
min,0.04,0.885246,0.52,0.153858,0.835616,0.428571,0.85199,0.560492
25%,0.2,0.97541,0.59041,0.328023,0.857143,0.666667,0.895362,0.709549
50%,0.24,0.991803,0.617705,0.381754,0.864767,0.833333,0.916226,0.740246
75%,0.32,1.0,0.643115,0.453676,0.874306,1.0,0.929285,0.775082
max,0.52,1.0,0.72,0.628256,0.9,1.0,0.956299,0.843934
