In [136]:
import pandas, numpy

from skops.io import load

from misc import construct_line

In [137]:
best_model = {}

for model in ['LR', 'NN', 'XB']:
    best_model[model] = load('models/'+model.lower()+'.skops', trusted=True)

X={}
Y={}
Z={}

for i in ['validation-samples', 'validation-mutations', 'mic', 'train']:
    X[i]={}
    Y[i]={}
    Z[i]={}
    with open('data/ds-'+i+'.npy', 'rb') as f:
        Y[i]['input'] = numpy.load(f)
        X[i]['input'] = numpy.load(f)
        Z[i]['input'] = numpy.load(f, allow_pickle=True)

# load in the results for SuspectPZA
# suspectpza={}
# for i in ['validation-samples', 'validation-mutations','mic']:
#     suspectpza[i]={}
#     with open('data/suspectpza-'+i+'.npy', 'rb') as f:
#         suspectpza[i]['input'] = numpy.load(f)
#         suspectpza[i]['predicted'] = numpy.load(f)
#         suspectpza[i]['muts'] = numpy.load(f, allow_pickle=True)
line = []

In [138]:
(Z['validation-mutations']['input'])

array(['M1I', 'M1T', 'A3E', 'L4S', 'I5S', 'I5T', 'I6L', 'I6S', 'I6T',
       'I6V', 'V7A', 'V7F', 'V7G', 'V7L', 'D8A', 'D8E', 'D8G', 'D8N',
       'V9G', 'Q10H', 'Q10K', 'Q10P', 'Q10R', 'D12A', 'D12E', 'D12N',
       'F13I', 'F13V', 'C14R', 'E15G', 'G17D', 'G17S', 'L19P', 'V21A',
       'V21G', 'G24D', 'L27P', 'R29P', 'A30V', 'I31S', 'Y34D', 'L35R',
       'H43P', 'H43Y', 'V44G', 'V45M', 'A46E', 'A46T', 'T47I', 'T47P',
       'K48N', 'D49A', 'D49G', 'D49N', 'H51D', 'H51P', 'H51Q', 'H51R',
       'H51Y', 'P54L', 'P54Q', 'P54S', 'H57D', 'H57L', 'H57P', 'H57Q',
       'H57R', 'H57Y', 'F58V', 'S59P', 'P62L', 'P62R', 'P62T', 'D63A',
       'S65P', 'S66L', 'S67P', 'W68C', 'W68G', 'W68L', 'W68R', 'W68S',
       'P69L', 'P69S', 'H71P', 'H71Q', 'H71Y', 'C72R', 'C72Y', 'T76I',
       'T76P', 'A79T', 'A79V', 'F81C', 'F81S', 'H82D', 'H82R', 'L85P',
       'L85R', 'T87M', 'I90T', 'F94C', 'F94S', 'K96E', 'K96N', 'K96R',
       'K96T', 'G97C', 'G97D', 'G97R', 'G97S', 'G97V', 'T100P', 'A102R',
       

In [139]:
def validate_model(line, best_model, model_name, dataset, X, Y):
    
    Y[dataset]['predicted'] = best_model.predict(X[dataset]['input'])
    Y[dataset]['scores'] = best_model.predict_proba(X[dataset]['input'])[:,1]

    row = construct_line(model_name , dataset, None, Y[dataset], None)

    n_muts_unique = (numpy.unique(Z[dataset]['input'])).shape[0]

    row.append(X[dataset]['input'].shape[0])
    row.append(n_muts_unique)
    line.append(row)

    return(line)

In [140]:
for dataset in ['validation-samples', 'validation-mutations', 'mic']: 
    line = validate_model(line, best_model['XB'], 'XB', dataset, X, Y)

In [141]:
test_results = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters', 'n_samples', 'n_mutations'])
test_results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,n_samples,n_mutations
0,XB,validation-samples,97.198276,,46.017699,,80.722592,,572,671,78,2706,,4027,367
1,XB,validation-mutations,97.419355,,59.090909,,88.284457,,26,18,4,151,,199,199
2,XB,mic,100.0,,14.285714,,67.714286,,1,6,0,50,,57,57


In [142]:
validation_mutations = pandas.read_csv('data/ds-validation-mutations-full.csv')
validation_mutations[:3]

Unnamed: 0,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE,IS_NONSYN
0,A102P,True,True,8,4,12,False,U,True
1,A102R,True,True,0,3,3,True,S,True
2,A102T,True,True,3,0,3,True,R,True


In [143]:
mutations_with_inconsistent_phenotypes = validation_mutations[validation_mutations.CONSISTENT_PHENOTYPE=='U'].MUTATION.unique()

In [144]:
total = validation_mutations[validation_mutations.CONSISTENT_PHENOTYPE=='U'].TOTAL.sum()
total_lt4 = validation_mutations[(validation_mutations.CONSISTENT_PHENOTYPE=='U') & (validation_mutations.TOTAL<4)].TOTAL.sum()
total_ge4 = validation_mutations[(validation_mutations.CONSISTENT_PHENOTYPE=='U') & (validation_mutations.TOTAL>=4)].TOTAL.sum()

In [145]:
total, total_lt4, total_ge4

(908, 160, 748)

In [146]:
mask = numpy.isin(Z['validation-samples']['input'], mutations_with_inconsistent_phenotypes)

X['validation-inconsistent-phenotypes-samples']={}
Y['validation-inconsistent-phenotypes-samples']={}
Z['validation-inconsistent-phenotypes-samples']={}

X['validation-inconsistent-phenotypes-samples']['input']=X['validation-samples']['input'][mask]
Y['validation-inconsistent-phenotypes-samples']['input']=Y['validation-samples']['input'][mask]
Z['validation-inconsistent-phenotypes-samples']['input']=Z['validation-samples']['input'][mask]

mask = ~numpy.isin(Z['validation-samples']['input'], mutations_with_inconsistent_phenotypes)

X['validation-consistent-phenotypes-samples']={}
Y['validation-consistent-phenotypes-samples']={}
Z['validation-consistent-phenotypes-samples']={}

X['validation-consistent-phenotypes-samples']['input']=X['validation-samples']['input'][mask]
Y['validation-consistent-phenotypes-samples']['input']=Y['validation-samples']['input'][mask]
Z['validation-consistent-phenotypes-samples']['input']=Z['validation-samples']['input'][mask]

In [147]:
for dataset in ['validation-inconsistent-phenotypes-samples', 'validation-consistent-phenotypes-samples']: 
    line = validate_model(line, best_model['XB'], 'XB', dataset, X, Y)

In [148]:
mask = numpy.isin(Z['validation-samples']['input'],  Z['train']['input'])

X['validation-consistent-phenotypes-train-samples']={}
Y['validation-consistent-phenotypes-train-samples']={}
Z['validation-consistent-phenotypes-train-samples']={}

X['validation-consistent-phenotypes-train-samples']['input']=X['validation-samples']['input'][mask]
Y['validation-consistent-phenotypes-train-samples']['input']=Y['validation-samples']['input'][mask]
Z['validation-consistent-phenotypes-train-samples']['input']=Z['validation-samples']['input'][mask]

mask = ~numpy.isin(Z['validation-samples']['input'],  Z['train']['input'])

X['validation-consistent-phenotypes-nottrain-samples']={}
Y['validation-consistent-phenotypes-nottrain-samples']={}
Z['validation-consistent-phenotypes-nottrain-samples']={}

X['validation-consistent-phenotypes-nottrain-samples']['input']=X['validation-samples']['input'][mask]
Z['validation-consistent-phenotypes-nottrain-samples']['input']=Z['validation-samples']['input'][mask]

Y['validation-consistent-phenotypes-nottrain-samples']['input']=Y['validation-samples']['input'][mask]


In [149]:
for dataset in ['validation-consistent-phenotypes-train-samples', 'validation-consistent-phenotypes-nottrain-samples']: 
    line = validate_model(line, best_model['XB'], 'XB', dataset, X, Y)

In [150]:
test_results = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters', 'n_samples', 'n_mutations'])
test_results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,n_samples,n_mutations
0,XB,validation-samples,97.198276,,46.017699,,80.722592,,572,671,78,2706,,4027,367
1,XB,validation-mutations,97.419355,,59.090909,,88.284457,,26,18,4,151,,199,199
2,XB,mic,100.0,,14.285714,,67.714286,,1,6,0,50,,57,57
3,XB,validation-inconsistent-phenotypes-samples,95.127119,,14.449541,,57.618615,,63,373,23,449,,908,168
4,XB,validation-consistent-phenotypes-samples,97.621107,,63.07311,,86.880823,,509,298,55,2257,,3119,199
5,XB,validation-consistent-phenotypes-train-samples,98.020955,,56.69383,,82.677322,,487,372,34,1684,,2577,187
6,XB,validation-consistent-phenotypes-nottrain-samples,95.87242,,22.135417,,75.236598,,85,299,44,1022,,1450,180
