# `12-create-figure-5`

In [1]:
import pandas, numpy, pathlib

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from skops.io import load

from misc import construct_line

pathlib.Path('pdf').mkdir(exist_ok=True)
pathlib.Path('pdf/figure-5').mkdir(exist_ok=True)

In [2]:
chosen_model = 'XB'

best_model = {}

for model in ['LR', 'NN', 'XB']:
    best_model[model] = load('models/'+model.lower()+'.skops', trusted=True)

X={}
Y={}
Z={}

for i in ['validation-samples', 'validation-mutations', 'mic', 'train', 'test']:
    X[i]={}
    Y[i]={}
    Z[i]={}
    with open('data/ds-'+i+'.npy', 'rb') as f:
        Y[i]['input'] = numpy.load(f)
        X[i]['input'] = numpy.load(f)
        Z[i]['input'] = numpy.load(f, allow_pickle=True)

# load in the results for SuspectPZA
suspectpza={}
for i in ['validation-samples', 'validation-mutations','mic']:
    suspectpza[i]={}
    with open('data/suspectpza-'+i+'.npy', 'rb') as f:
        suspectpza[i]['input'] = numpy.load(f)
        suspectpza[i]['predicted'] = numpy.load(f)
        suspectpza[i]['muts'] = numpy.load(f, allow_pickle=True)
line = []

In [3]:
def validate_model(line, best_model, model_name, dataset, X, Y):
    
    Y[dataset]['predicted'] = best_model.predict(X[dataset]['input'])
    Y[dataset]['scores'] = best_model.predict_proba(X[dataset]['input'])[:,1]

    row = construct_line(model_name , dataset, None, Y[dataset], None)

    n_muts_unique = (numpy.unique(Z[dataset]['input'])).shape[0]

    row.append(X[dataset]['input'].shape[0])
    row.append(n_muts_unique)
    line.append(row)

    return(line)

First, let's work out which of the mutations in the Validation dataset have inconsistent mutations

In [4]:
validation_mutations = pandas.read_csv('data/ds-validation-mutations-full.csv')

mutations_with_consistent_phenotypes= set(validation_mutations[validation_mutations.CONSISTENT_PHENOTYPE.isin(['R', 'S'])].MUTATION.unique())

mutations_with_inconsistent_phenotypes = set(validation_mutations[validation_mutations.CONSISTENT_PHENOTYPE=='U'].MUTATION.unique())

In [5]:
len(mutations_with_inconsistent_phenotypes), len(mutations_with_consistent_phenotypes)

(168, 199)

In [6]:
mask = numpy.isin(Z['validation-samples']['input'], numpy.array(list(mutations_with_inconsistent_phenotypes)))

X['vs-incons']={}
Y['vs-incons']={}
Z['vs-incons']={}

X['vs-incons']['input']=X['validation-samples']['input'][mask]
Y['vs-incons']['input']=Y['validation-samples']['input'][mask]
Z['vs-incons']['input']=Z['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-samples']['input'], numpy.array(list(mutations_with_consistent_phenotypes)))

X['vs-consis']={}
Y['vs-consis']={}
Z['vs-consis']={}

X['vs-consis']['input']=X['validation-samples']['input'][mask]
Y['vs-consis']['input']=Y['validation-samples']['input'][mask]
Z['vs-consis']['input']=Z['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-mutations']['input'], numpy.array(list(mutations_with_consistent_phenotypes)))

X['vm-consis']={}
Y['vm-consis']={}
Z['vm-consis']={}

X['vm-consis']['input']=X['validation-mutations']['input'][mask]
Y['vm-consis']['input']=Y['validation-mutations']['input'][mask]
Z['vm-consis']['input']=Z['validation-mutations']['input'][mask]

# mask = numpy.isin(Z['validation-mutations']['input'], numpy.array(list(mutations_with_consistent_phenotypes)))

# X['vm-incons']={}
# Y['vm-incons']={}
# Z['vm-incons']={}

# X['vm-incons']['input']=X['validation-mutations']['input'][mask]
# Y['vm-incons']['input']=Y['validation-mutations']['input'][mask]
# Z['vm-incons']['input']=Z['validation-mutations']['input'][mask]


In [7]:
for dataset in ['validation-samples', 'validation-mutations',  'mic', 'vs-incons', 'vs-consis', 'vm-consis']: #, 'vm-incons']: 
    line = validate_model(line, best_model[chosen_model], chosen_model, dataset, X, Y)

In [8]:
total = validation_mutations.TOTAL.sum()

print("There are %i samples in the validation dataset" % (total))

total = validation_mutations[validation_mutations.CONSISTENT_PHENOTYPE=='U'].TOTAL.sum()

print("Of these, %i have a mutation which has an inconsistent phenotype" % (total))

total_lt4 = validation_mutations[(validation_mutations.CONSISTENT_PHENOTYPE=='U') & (validation_mutations.TOTAL<4)].TOTAL.sum()
total_ge4 = validation_mutations[(validation_mutations.CONSISTENT_PHENOTYPE=='U') & (validation_mutations.TOTAL>=4)].TOTAL.sum()
print("And %i of these are samples were there are fewer than 4 samples and %i are samples with five or more samples" % (total_lt4, total_ge4))

There are 4027 samples in the validation dataset
Of these, 908 have a mutation which has an inconsistent phenotype
And 160 of these are samples were there are fewer than 4 samples and 748 are samples with five or more samples


In [9]:
mutations_in_train  = set(Z['train']['input'])  #.union(set(Z['test']['input']))

mutations_with_consistent_phenotypes_in_train = mutations_with_consistent_phenotypes.intersection(mutations_in_train)
mutations_with_consistent_phenotypes_notin_train = mutations_with_consistent_phenotypes - mutations_in_train

mutations_with_inconsistent_phenotypes_in_train = mutations_with_inconsistent_phenotypes.intersection(mutations_in_train)
mutations_with_inconsistent_phenotypes_notin_train = mutations_with_inconsistent_phenotypes - mutations_in_train 

len(mutations_with_consistent_phenotypes_in_train), len(mutations_with_consistent_phenotypes_notin_train), len(mutations_with_inconsistent_phenotypes_in_train), len(mutations_with_inconsistent_phenotypes_notin_train)

(116, 83, 71, 97)

In [10]:
for i in ['vs-consis-train', 'vs-consis-notrain', 'vs-incons-train', 'vs-incons-notrain', 'vm-consis-train', 'vm-consis-notrain', 'vm-incons-train', 'vm-incons-notrain']:
    X[i] = {}
    Y[i] = {}
    Z[i] = {}

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_in_train)))
X['vs-consis-train']['input']=X['validation-samples']['input'][mask]
Y['vs-consis-train']['input']=Y['validation-samples']['input'][mask]
Z['vs-consis-train']['input']=Z['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_notin_train)))
X['vs-consis-notrain']['input']=X['validation-samples']['input'][mask]
Y['vs-consis-notrain']['input']=Y['validation-samples']['input'][mask]
Z['vs-consis-notrain']['input']=Z['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_inconsistent_phenotypes_in_train)))
X['vs-incons-train']['input']=X['validation-samples']['input'][mask]
Z['vs-incons-train']['input']=Z['validation-samples']['input'][mask]
Y['vs-incons-train']['input']=Y['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_inconsistent_phenotypes_notin_train)))
X['vs-incons-notrain']['input']=X['validation-samples']['input'][mask]
Z['vs-incons-notrain']['input']=Z['validation-samples']['input'][mask]
Y['vs-incons-notrain']['input']=Y['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-mutations']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_in_train)))
X['vm-consis-train']['input']=X['validation-mutations']['input'][mask]
Y['vm-consis-train']['input']=Y['validation-mutations']['input'][mask]
Z['vm-consis-train']['input']=Z['validation-mutations']['input'][mask]


mask = numpy.isin(Z['validation-mutations']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_in_train)))
X['vm-consis-train']['input']=X['validation-mutations']['input'][mask]
Y['vm-consis-train']['input']=Y['validation-mutations']['input'][mask]
Z['vm-consis-train']['input']=Z['validation-mutations']['input'][mask]

mask = numpy.isin(Z['validation-mutations']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_notin_train)))
X['vm-consis-notrain']['input']=X['validation-mutations']['input'][mask]
Y['vm-consis-notrain']['input']=Y['validation-mutations']['input'][mask]
Z['vm-consis-notrain']['input']=Z['validation-mutations']['input'][mask]


# mask = numpy.isin(Z['validation-mutations']['input'],  numpy.array(list(mutations_with_inconsistent_phenotypes_in_train)))
# X['vm-incons-train']['input']=X['validation-mutations']['input'][mask]
# Z['vm-incons-train']['input']=Z['validation-mutations']['input'][mask]
# Y['vm-incons-train']['input']=Y['validation-mutations']['input'][mask]

# mask = numpy.isin(Z['validation-mutations']['input'],  numpy.array(list(mutations_with_inconsistent_phenotypes_notin_train)))
# X['vm-incons-notrain']['input']=X['validation-mutations']['input'][mask]
# Z['vm-incons-notrain']['input']=Z['validation-mutations']['input'][mask]
# Y['vm-incons-notrain']['input']=Y['validation-mutations']['input'][mask]


for dataset in ['vs-consis-train', 'vs-consis-notrain', 'vs-incons-train', 'vs-incons-notrain', 'vm-consis-train', 'vm-consis-notrain']: # 'vm-incons-train', 'vm-incons-notrain']:
    print(dataset)
    line = validate_model(line, best_model[chosen_model], chosen_model, dataset, X, Y)

vs-consis-train
vs-consis-notrain
vs-incons-train
vs-incons-notrain
vm-consis-train
vm-consis-notrain


In [11]:
mutations_with_consistent_phenotypes_notin_train_large = set(list(validation_mutations[(validation_mutations.MUTATION.isin(mutations_with_consistent_phenotypes_notin_train)) & (validation_mutations.TOTAL>=4)].MUTATION.unique()))
mutations_with_consistent_phenotypes_notin_train_small = set(list(validation_mutations[(validation_mutations.MUTATION.isin(mutations_with_consistent_phenotypes_notin_train)) & (validation_mutations.TOTAL<4)].MUTATION.unique()))

for i in ['vs-consis-notrain-large', 'vs-consis-notrain-small']:
    X[i] = {}
    Y[i] = {}
    Z[i] = {}

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_notin_train_large)))
X['vs-consis-notrain-large']['input']=X['validation-samples']['input'][mask]
Z['vs-consis-notrain-large']['input']=Z['validation-samples']['input'][mask]
Y['vs-consis-notrain-large']['input']=Y['validation-samples']['input'][mask]

mask = numpy.isin(Z['validation-samples']['input'],  numpy.array(list(mutations_with_consistent_phenotypes_notin_train_small)))
X['vs-consis-notrain-small']['input']=X['validation-samples']['input'][mask]
Z['vs-consis-notrain-small']['input']=Z['validation-samples']['input'][mask]
Y['vs-consis-notrain-small']['input']=Y['validation-samples']['input'][mask]

for dataset in ['vs-consis-notrain-large', 'vs-consis-notrain-small']:
    print(dataset)
    line = validate_model(line, best_model[chosen_model], chosen_model, dataset, X, Y)

vs-consis-notrain-large
vs-consis-notrain-small


In [12]:
'resid '+ ' '.join([i[1:-1] for i in mutations_with_consistent_phenotypes_notin_train_large])

'resid 45 82 21 175 135 119 157 24 10 130 146 8 68 182 97 79 12 125 46 120 103 57 97 100 156 180 128 108 35 94 47 172 69 7 146 143 8 49 62 81 21 10 159 143 1 65 71 116 63 67 96'

In [13]:
'resid '+ ' '.join([i[1:-1] for i in mutations_with_consistent_phenotypes_notin_train_small])

'resid 172 102 68 57 68 13 10 10 6 46 43 57 146 137 119 104 102 81 58 168 167 119 140 90 6 15 43 124 157 17 48 13'

In [14]:
test_results = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters', 'n_samples', 'n_mutations'])
test_results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,n_samples,n_mutations
0,XB,validation-samples,97.198276,,46.017699,,80.722592,,572,671,78,2706,,4027,367
1,XB,validation-mutations,97.419355,,59.090909,,88.284457,,26,18,4,151,,199,199
2,XB,mic,100.0,,14.285714,,67.714286,,1,6,0,50,,57,57
3,XB,vs-incons,95.127119,,14.449541,,57.618615,,63,373,23,449,,908,168
4,XB,vs-consis,97.621107,,63.07311,,86.880823,,509,298,55,2257,,3119,199
5,XB,vm-consis,97.419355,,59.090909,,88.284457,,26,18,4,151,,199,199
6,XB,vs-consis-train,98.250525,,75.609756,,89.577224,,465,150,25,1404,,2044,116
7,XB,vs-consis-notrain,96.602492,,22.916667,,75.711058,,44,148,30,853,,1075,83
8,XB,vs-incons-train,96.885813,,9.016393,,56.030546,,22,222,9,280,,533,71
9,XB,vs-incons-notrain,92.349727,,21.354167,,61.111111,,41,151,14,169,,375,97


In [15]:
for dataset in list(test_results.dataset):
    print(dataset)
    colour='#888888'
    fig = plt.figure(figsize=(1.8, 3.5))
    axes = plt.gca()
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)
    axes.spines['left'].set_visible(False)
    axes.get_yaxis().set_visible(False)
    x=[0,1]
    y=test_results[(test_results.dataset==dataset)][['sensitivity_mean','specificity_mean']].values[0]
    axes.set_ylim([0,100])
    axes.bar(x,y, label=y, edgecolor=colour, color='None',linewidth=2)
    axes.set_xticks([0,1])
    axes.set_xticklabels(['sensitivity', 'specificity'])
    for (i,j) in zip(x,y):
        axes.text(i,j+2,'%.1f' % j,ha='center',color=colour)
    fig.savefig('pdf/figure-5/fig-5-'+str(dataset)+'.pdf',bbox_inches='tight',transparent=True)
    plt.close()

validation-samples
validation-mutations


mic
vs-incons
vs-consis
vm-consis
vs-consis-train
vs-consis-notrain
vs-incons-train
vs-incons-notrain
vm-consis-train
vm-consis-notrain
vs-consis-notrain-large
vs-consis-notrain-small


In [16]:
for idx,row in test_results.iterrows():

    fig = plt.figure(figsize=(1.5, 1.5))
    axes = plt.gca()

    axes.add_patch(Rectangle((0,0),1,1,fc='#e41a1c',alpha=0.7))
    axes.add_patch(Rectangle((0,1),1,1,fc='#4daf4a',alpha=0.7))
    axes.add_patch(Rectangle((1,1),1,1,fc='#fc9272',alpha=0.7))
    axes.add_patch(Rectangle((1,0),1,1,fc='#4daf4a',alpha=0.7))

    axes.set_xlim([0,2])
    axes.set_ylim([0,2])

    axes.set_xticks([0.5,1.5],labels=['R','S'])
    axes.set_yticks([0.5,1.5],labels=['S','R'])

    axes.text(0.5,0.5,row['FN'],ha='center',va='center')
    axes.text(1.5,0.5,row['TN'],ha='center',va='center')
    axes.text(0.5,1.5,row['TP'],ha='center',va='center')
    axes.text(1.5,1.5,row['FP'],ha='center',va='center')

    fig.savefig('pdf/figure-5/truthtable-'+row['dataset']+'-'+row['model']+'.pdf',bbox_inches='tight',transparent=True)
    plt.close()

In [17]:
total_lt4 = validation_mutations[(validation_mutations.MUTATION.isin(mutations_with_consistent_phenotypes_notin_train)) & (validation_mutations.TOTAL>=4)].TOTAL.sum()
len_lt4 = len(validation_mutations[(validation_mutations.MUTATION.isin(mutations_with_consistent_phenotypes_notin_train)) & (validation_mutations.TOTAL>=4)])
total_lt4, len_lt4

(995, 51)

In [18]:
validation_mutations

Unnamed: 0,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE,IS_NONSYN
0,A102P,True,True,8,4,12,False,U,True
1,A102R,True,True,0,3,3,True,S,True
2,A102T,True,True,3,0,3,True,R,True
3,A102V,True,True,12,13,25,False,U,True
4,A134D,True,True,0,1,1,False,U,True
...,...,...,...,...,...,...,...,...,...
362,Y41C,True,True,0,1,1,False,U,True
363,Y64D,True,True,2,4,6,False,U,True
364,Y64S,True,True,1,2,3,False,U,True
365,Y95D,True,True,1,0,1,False,U,True
