# `11-validate-models`

In [1]:
import pandas, numpy, pathlib

from skops.io import load

from sklearn.utils import resample

import matplotlib.pyplot as plt

from misc import construct_line

from statsmodels.stats.weightstats import ztest as ztest

# Number of bootstrap resamples drawn per (model, dataset) pair in bootstrap_model.
# The bootstrap summary and the z-test cells further down also read this value.
number_of_bootstraps = 10

### Model validation on `validation` and `mic` sets

As discussed in the manuscript we have two further datasets (`validation` & `mic`) to evaluate the models on.

First, let's load the trained models, the datasets and the results for SuspectPZA.


In [2]:
# Trained models keyed by their short code; each one lives in models/<code in lower case>.skops.
# NOTE(review): trusted=True presumably tells skops to accept every type stored in the file --
# only load model files from a source you control, and confirm against the installed skops version.
best_model = {
    code: load('models/' + code.lower() + '.skops', trusted=True)
    for code in ['LR', 'NN', 'XB']
}

dataset_names = ['validation-samples', 'validation-mutations', 'mic']

# One dict per dataset name; the arrays read from disc are stored under the 'input' key.
# presumably X holds features, Y labels and Z mutation names -- verify against the code that wrote the files
X = {name: {} for name in dataset_names}
Y = {name: {} for name in dataset_names}
Z = {name: {} for name in dataset_names}

for name in dataset_names:
    with open('data/ds-' + name + '.npy', 'rb') as handle:
        # the three arrays are read back in file order: Y first, then X, then Z
        Y[name]['input'] = numpy.load(handle)
        X[name]['input'] = numpy.load(handle)
        # NOTE(review): allow_pickle=True unpickles objects from the file; only use on trusted data
        Z[name]['input'] = numpy.load(handle, allow_pickle=True)

# load in the results for SuspectPZA (same three dataset names, three arrays per file)
suspectpza = {}
for name in dataset_names:
    with open('data/suspectpza-' + name + '.npy', 'rb') as handle:
        # dict literals evaluate top to bottom, so the read order is input, predicted, muts
        suspectpza[name] = {
            'input': numpy.load(handle),
            'predicted': numpy.load(handle),
            'muts': numpy.load(handle, allow_pickle=True),
        }

# accumulates one result row per (model, dataset); turned into a DataFrame further down
line = []

For comparison later, let's make a subset of `validation-samples` that excludes mutations with an inconsistent phenotype, either because too few examples are in the dataset or because they do not consistently test as R or S, suggesting they may be near the breakpoint/ECOFF.

We need to do this for both the `validation-samples` features and the results from the published SuspectPZA model.

In [3]:
df = pandas.read_csv('data/ds-validation-mutations-full.csv')

# mutations whose CONSISTENT_PHENOTYPE is flagged 'U'; samples carrying them are dropped below
inconsistent_mutations = df.loc[df.CONSISTENT_PHENOTYPE == 'U', 'MUTATION'].unique()

# keep only the validation samples whose mutation (held in Z) is not in that list
mask = ~numpy.isin(Z['validation-samples']['input'], inconsistent_mutations)

# the same mask is applied to X, Y and Z -- assumes the three arrays are row-aligned
for store in (X, Y, Z):
    store['validation-samplesnoU'] = {'input': store['validation-samples']['input'][mask]}

# SuspectPZA results carry their own mutation array, so they need their own mask
mask = ~numpy.isin(suspectpza['validation-samples']['muts'], inconsistent_mutations)
suspectpza['validation-samplesnoU'] = {
    key: suspectpza['validation-samples'][key][mask]
    for key in ('input', 'predicted', 'muts')
}

In [4]:
# the mutations that are kept, i.e. every mutation whose CONSISTENT_PHENOTYPE is not 'U'
df.loc[df.CONSISTENT_PHENOTYPE != 'U', 'MUTATION'].unique()

array(['A102R', 'A102T', 'A134V', 'A143G', 'A143T', 'A143V', 'A146E',
       'A146P', 'A146T', 'A146V', 'A171V', 'A30V', 'A3E', 'A46E', 'A46T',
       'A79T', 'A79V', 'C138R', 'C14R', 'C72R', 'C72Y', 'D129Y', 'D12A',
       'D12E', 'D12N', 'D136Y', 'D49A', 'D49G', 'D49N', 'D63A', 'D8A',
       'D8E', 'D8G', 'D8N', 'E15G', 'F106S', 'F13I', 'F13V', 'F58V',
       'F81C', 'F81S', 'F94C', 'F94S', 'G105D', 'G105V', 'G108R', 'G124D',
       'G132A', 'G132C', 'G132D', 'G132S', 'G162D', 'G17D', 'G17S',
       'G24D', 'G97C', 'G97D', 'G97R', 'G97S', 'G97V', 'H137P', 'H137Q',
       'H43P', 'H43Y', 'H51D', 'H51P', 'H51Q', 'H51R', 'H51Y', 'H57D',
       'H57L', 'H57P', 'H57Q', 'H57R', 'H57Y', 'H71P', 'H71Q', 'H71Y',
       'H82D', 'H82R', 'I31S', 'I5S', 'I5T', 'I6L', 'I6S', 'I6T', 'I6V',
       'I90T', 'K48N', 'K96E', 'K96N', 'K96R', 'K96T', 'L116R', 'L120P',
       'L120Q', 'L120R', 'L151S', 'L156P', 'L159R', 'L172P', 'L172R',
       'L182F', 'L19P', 'L27P', 'L35R', 'L4S', 'L85P', 'L85R', 'M175I

The function below takes a supplied model, applies it to the features of each dataset and computes a range of metrics we can use to evaluate its performance.

Note that this uses the separate `construct_line` function which can be found in `misc.py`.

In [5]:
def validate_model(line, best_model, model_name, X, Y):
    """Apply one fitted model to every evaluation dataset and record a result row for each.

    Parameters
    ----------
    line : list
        Rows collected so far; one new row per dataset is appended in place.
    best_model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    model_name : str
        Label passed to ``construct_line`` to identify the model in the row.
    X, Y : dict
        Keyed by dataset name; ``X[name]['input']`` is fed to the model and
        ``Y[name]`` is handed to ``construct_line``.

    Returns
    -------
    list
        The same ``line`` object, after the new rows have been appended.

    Notes
    -----
    ``Y[name]['predicted']`` and ``Y[name]['scores']`` are written on the shared ``Y``
    dict, so each call overwrites the values left by the previous model.
    """
    datasets = ('validation-samples', 'validation-samplesnoU', 'validation-mutations', 'mic')

    for name in datasets:
        features = X[name]['input']
        outcome = Y[name]

        outcome['predicted'] = best_model.predict(features)
        # second column of predict_proba -- presumably the probability of the positive class
        outcome['scores'] = best_model.predict_proba(features)[:, 1]

        # construct_line (from misc.py) turns the labels/predictions into one row of metrics
        line.append(construct_line(model_name, name, None, outcome, None))

    return line

In [6]:
# Score each trained model on the evaluation datasets; the NN, XB, LR order fixes the row order.
for model_code in ('NN', 'XB', 'LR'):
    line = validate_model(line, best_model[model_code], model_code, X, Y)

In [7]:
# Add one row per dataset for the published SuspectPZA predictions, labelled 'SP'.
line.extend(
    construct_line('SP', dataset_name, None, suspectpza[dataset_name], None)
    for dataset_name in ('validation-samples', 'validation-samplesnoU', 'validation-mutations', 'mic')
)

In [8]:
def bootstrap_model(line, best_model, model_name, X, Y):
    """Evaluate one fitted model on bootstrap resamples of every evaluation dataset.

    For each dataset, ``number_of_bootstraps`` resamples of ``X[name]['input']`` and
    ``Y[name]['input']`` are drawn with ``resample`` (seeds 42, 43, ...), the model is
    applied to each resample and ``construct_line`` builds a row labelled
    ``'<dataset>_<replicate index>'``.

    Parameters
    ----------
    line : list
        Rows collected so far; the new rows are appended in place.
    best_model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    model_name : str
        Label identifying the model in each row (also printed as progress).
    X, Y : dict
        Keyed by dataset name, each entry holding its array under ``'input'``.

    Returns
    -------
    list
        The same ``line`` object, after the new rows have been appended.
    """
    datasets = ('validation-samples', 'validation-samplesnoU', 'validation-mutations', 'mic')

    for name in datasets:
        print(model_name, name)
        for replicate in range(number_of_bootstraps):
            # a different but fixed seed per replicate keeps the resamples reproducible
            features, labels = resample(X[name]['input'], Y[name]['input'], random_state=42 + replicate)
            sample = {
                'input': labels,
                'predicted': best_model.predict(features),
                # second column of predict_proba -- presumably the positive-class probability
                'scores': best_model.predict_proba(features)[:, 1],
            }
            line.append(construct_line(model_name, name + '_' + str(replicate), None, sample, None))

    return line

In [9]:
# Bootstrap every trained model; the replicate rows are appended after the point estimates.
for model_code in ('LR', 'NN', 'XB'):
    line = bootstrap_model(line, best_model[model_code], model_code, X, Y)

LR validation-samples
LR validation-samplesnoU
LR validation-mutations
LR mic
NN validation-samples
NN validation-samplesnoU
NN validation-mutations
NN mic
XB validation-samples
XB validation-samplesnoU
XB validation-mutations
XB mic


In [10]:
# Gather every row collected so far into one table; the column names follow the row layout
# used for the rows appended to `line`.
result_columns = [
    'model', 'dataset',
    'sensitivity_mean', 'sensitivity_std',
    'specificity_mean', 'specificity_std',
    'roc_auc_mean', 'roc_auc_std',
    'TN', 'FP', 'FN', 'TP',
    'model_parameters',
]
test_results = pandas.DataFrame(line, columns=result_columns)
test_results.head(3)

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters
0,NN,validation-samples,90.517241,,51.649236,,73.731156,,642,601,264,2520,
1,NN,validation-samplesnoU,91.479239,,66.666667,,80.821306,,538,269,197,2115,
2,NN,validation-mutations,94.193548,,52.272727,,77.565982,,23,21,9,146,


In [11]:
# Summarise the bootstrap replicates: one 'bootstrapped-<dataset>' row per (dataset, model)
# holding the mean of each metric over the replicates, plus a 1.96 * std / sqrt(n) half-width
# for sensitivity and specificity.
# NOTE(review): the half-widths are stored in the *_std columns although they are not plain
# standard deviations, and no spread is computed for roc_auc (roc_auc_std stays empty).
# NOTE: this cell rebinds `line`; re-run the earlier cells before rebuilding test_results from it.
line = []
for i in ['validation-samples', 'validation-samplesnoU', 'validation-mutations', 'mic']:
    # replicate rows are named '<dataset>_<k>'; regex=False matches the name literally.
    # The point-estimate row (dataset == i) has no trailing '_' so it is never selected.
    is_replicate = test_results.dataset.str.contains(i+"_", regex=False)
    for model in ['LR', 'NN', 'XB']:
        # select the replicates once instead of re-filtering the table for every statistic
        replicates = test_results[(test_results.model==model) & is_replicate]
        sens_mean = replicates.sensitivity_mean.mean()
        sens_std = 1.96*replicates.sensitivity_mean.std()/(number_of_bootstraps**0.5)
        spec_mean = replicates.specificity_mean.mean()
        spec_std = 1.96*replicates.specificity_mean.std()/(number_of_bootstraps**0.5)
        roc_mean = replicates.roc_auc_mean.mean()
        row = [model, 'bootstrapped-'+i, sens_mean, sens_std, spec_mean, spec_std, roc_mean, None, None, None, None, None, None ]
        line.append(row)

extra_rows = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters'])

# Drop summary rows left by an earlier run of this cell so that re-running it does not
# append duplicates, and renumber the index so every row keeps a unique label.
previous_summary = test_results.dataset.str.startswith('bootstrapped-')
test_results = pandas.concat([test_results[~previous_summary], extra_rows], ignore_index=True)
test_results[:3]

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters
0,NN,validation-samples,90.517241,,51.649236,,73.731156,,642,601,264,2520,
1,NN,validation-samplesnoU,91.479239,,66.666667,,80.821306,,538,269,197,2115,
2,NN,validation-mutations,94.193548,,52.272727,,77.565982,,23,21,9,146,


In [12]:
# Display the bootstrap summary rows ('bootstrapped-<dataset>') built in the previous cell.
extra_rows


Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters
0,LR,bootstrapped-validation-samples,98.028237,0.159763,40.656509,0.858797,79.678077,,,,,,
1,NN,bootstrapped-validation-samples,90.320452,0.376692,52.127855,0.802745,73.72043,,,,,,
2,XB,bootstrapped-validation-samples,97.198569,0.177182,46.299291,1.013533,80.7848,,,,,,
3,LR,bootstrapped-validation-samplesnoU,98.518352,0.159959,56.051718,1.120422,84.898344,,,,,,
4,NN,bootstrapped-validation-samplesnoU,91.512061,0.370772,66.653695,1.441935,80.977787,,,,,,
5,XB,bootstrapped-validation-samplesnoU,97.706924,0.213195,63.388164,1.085203,86.908148,,,,,,
6,LR,bootstrapped-validation-mutations,98.539608,0.633964,46.422939,4.605406,86.183604,,,,,,
7,NN,bootstrapped-validation-mutations,94.848744,1.220856,56.079319,3.478481,77.655329,,,,,,
8,XB,bootstrapped-validation-mutations,97.875205,0.881279,63.210372,3.527627,89.301213,,,,,,
9,LR,bootstrapped-mic,100.0,0.0,0.0,0.0,65.335807,,,,,,


In [13]:
# Look at the individual bootstrap replicates of the XB model on validation-samplesnoU.
# A name containing 'validation-samplesnoU_' can never equal 'validation-samplesnoU',
# so the substring test alone is enough to exclude the point-estimate row.
is_xb = test_results.model == 'XB'
is_noU_replicate = test_results.dataset.str.contains('validation-samplesnoU' + "_")
test_results[is_xb & is_noU_replicate]

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters
106,XB,validation-samplesnoU_0,97.424893,,61.21673,,86.243167,,483,306,60,2270,
107,XB,validation-samplesnoU_1,97.309991,,66.409266,,87.141555,,516,261,63,2279,
108,XB,validation-samplesnoU_2,98.047722,,61.425061,,86.349166,,500,314,45,2260,
109,XB,validation-samplesnoU_3,98.088889,,65.362486,,88.383557,,568,301,43,2207,
110,XB,validation-samplesnoU_4,98.001738,,63.647491,,87.115589,,520,297,46,2256,
111,XB,validation-samplesnoU_5,98.085106,,65.019506,,88.235371,,500,269,45,2305,
112,XB,validation-samplesnoU_6,97.245763,,62.977602,,86.161514,,478,281,65,2295,
113,XB,validation-samplesnoU_7,97.389034,,63.215591,,86.979755,,519,302,60,2238,
114,XB,validation-samplesnoU_8,97.637795,,61.704682,,85.934899,,514,319,54,2232,
115,XB,validation-samplesnoU_9,97.838305,,62.903226,,86.536906,,507,299,50,2263,


In [14]:
# Pairwise two-sample z-tests between the models, run on the bootstrap replicates of each
# metric, printing every ordered pair whose p-value is below 0.05.
# Each unordered pair is therefore reported twice (i vs j and j vs i).
# NOTE(review): no multiple-comparison correction is applied to the 0.05 threshold.
for dataset in ['validation-samples', 'validation-samplesnoU', 'validation-mutations', 'mic']:
    # bootstrap replicate rows for this dataset are named '<dataset>_<k>'
    replicates = test_results[test_results.dataset.str.contains(dataset+'_')]
    for metric in ['sensitivity_mean', 'specificity_mean']:
        # collect the replicate values of every model once, rather than inside the pair loop
        values = {}
        for model in ['XB', 'NN', 'LR']:
            values[model] = numpy.array(replicates[replicates.model==model][metric], dtype=float)
            assert len(values[model]) == number_of_bootstraps
        for i in ['XB', 'NN', 'LR']:
            for j in ['XB', 'NN', 'LR']:
                if i==j:
                    continue
                a = values[i]
                b = values[j]
                if a.var() == 0 and b.var() == 0:
                    # Both samples are constant, so the pooled variance is zero and ztest would
                    # divide by zero (RuntimeWarning). Reproduce its result without the warning:
                    # p = 0 when the constants differ, undefined (nan) when they are equal.
                    pvalue = 0.0 if a.mean() != b.mean() else numpy.nan
                else:
                    _, pvalue = ztest(a,b)
                if pvalue < 0.05:
                    print(dataset,metric, i,j,"Signficant",pvalue)
        print()

validation-samples sensitivity_mean XB NN Signficant 4.525453354019123e-230
validation-samples sensitivity_mean XB LR Signficant 9.353693646389704e-12
validation-samples sensitivity_mean NN XB Signficant 4.525453354019123e-230
validation-samples sensitivity_mean NN LR Signficant 2.0842450276605338e-298
validation-samples sensitivity_mean LR XB Signficant 9.353693646389704e-12
validation-samples sensitivity_mean LR NN Signficant 2.0842450276605338e-298

validation-samples specificity_mean XB NN Signficant 9.939685626049165e-19
validation-samples specificity_mean XB LR Signficant 8.40611995586018e-17
validation-samples specificity_mean NN XB Signficant 9.939685626049165e-19
validation-samples specificity_mean NN LR Signficant 1.5301183275354006e-81
validation-samples specificity_mean LR XB Signficant 8.40611995586018e-17
validation-samples specificity_mean LR NN Signficant 1.5301183275354006e-81

validation-samplesnoU sensitivity_mean XB NN Signficant 2.750714768725531e-177
validation-sa

  zstat = (value1 - value2 - diff) / std_diff


In [18]:
# The diagnostic odds ratio, (TN*TP)/(FN*FP), is deliberately not computed here:
# both columns are filled with the placeholder value 0.
# NOTE(review): readers of results-validation.csv should not treat these zeros as real values.
test_results = test_results.assign(
    diagnostic_odds_ratio_mean=0,
    diagnostic_odds_ratio_std=0,
)

# save to disc as a CSV (without the index column)
test_results.to_csv('results-validation.csv', index=False)

test_results.head(20)

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,diagnostic_odds_ratio_mean,diagnostic_odds_ratio_std
0,NN,validation-samples,90.517241,,51.649236,,73.731156,,642,601,264,2520,,0,0
1,NN,validation-samplesnoU,91.479239,,66.666667,,80.821306,,538,269,197,2115,,0,0
2,NN,validation-mutations,94.193548,,52.272727,,77.565982,,23,21,9,146,,0,0
3,NN,mic,94.0,,42.857143,,63.142857,,3,4,3,47,,0,0
4,XB,validation-samples,97.198276,,46.017699,,80.722592,,572,671,78,2706,,0,0
5,XB,validation-samplesnoU,97.621107,,63.07311,,86.880823,,509,298,55,2257,,0,0
6,XB,validation-mutations,97.419355,,59.090909,,88.284457,,26,18,4,151,,0,0
7,XB,mic,100.0,,14.285714,,67.714286,,1,6,0,50,,0,0
8,LR,validation-samples,98.060345,,40.305712,,79.642377,,501,742,54,2730,,0,0
9,LR,validation-samplesnoU,98.442907,,56.133829,,84.84152,,453,354,36,2276,,0,0


In [19]:
# sanity check: how many result rows each model contributed
test_results['model'].value_counts()

model
NN    48
XB    48
LR    48
SP     4
Name: count, dtype: int64

Finally, let's see how well the published `SuspectPZA` model does on these validation datasets.

In [20]:
# rows for the published SuspectPZA model (labelled 'SP')
test_results.loc[test_results['model'] == 'SP']

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,diagnostic_odds_ratio_mean,diagnostic_odds_ratio_std
12,SP,validation-samples,93.678161,,44.328238,,,,551,692,176,2608,,0,0
13,SP,validation-samplesnoU,96.107266,,57.372986,,,,463,344,90,2222,,0,0
14,SP,validation-mutations,92.258065,,47.727273,,,,21,23,12,143,,0,0
15,SP,mic,92.0,,14.285714,,,,1,6,4,46,,0,0
