In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn import metrics

In [None]:
# Load the pre-split, curated evaluation set (50% of the original data),
# keep the label column plus the model features, and drop duplicate rows.
y = pd.read_csv('test50.csv', sep=',', low_memory=False)
x = y[[
    'PCHANGE', 'GENE', 'Domain', 'ClinicalSignificance',
    'EFFECT', 'Civic_Evidence', 'CATEGORIZATION', 'FAF',
    'GNOMAD_AC', 'GNOMAD_AF', 'EXON_Rank', 'COSMIC_CNT',
    'MVP_score', 'Civic_Drug', 'PMID_COUNT', 'KEGG',
]].drop_duplicates()
x.head()

In [3]:
# Class balance of the de-duplicated data, before any down-sampling.
x.loc[:, 'CATEGORIZATION'].value_counts()

CATEGORIZATION
Benign                387158
Disease Associated     18027
VOUS                   16367
Probably DA             1144
Likely Benign            864
Name: count, dtype: int64

In [4]:
# Balance the evaluation set by drawing 500 rows from every category.
# The fixed random_state makes the draw (and therefore every downstream
# ROC curve) reproducible after a kernel restart.
x = x.groupby('CATEGORIZATION').sample(n=500, random_state=42)
x['CATEGORIZATION'].value_counts()

In [8]:
# Feature matrix for the models: everything except the label column,
# with missing values replaced by the -999 sentinel.
s = x.drop(columns='CATEGORIZATION').fillna(-999)

In [10]:
# Reviewer asked that we only use SNVs for the MVP data: exclude frameshift
# ('fs'), insertion ('ins'), deletion ('del') and stop ('*') protein changes.
# na=True marks rows with a missing PCHANGE as "matched", so they are
# excluded as well -- the same outcome as the previous `== False` comparison.
is_snv = ~x['PCHANGE'].str.contains('fs|ins|del|\\*', na=True)
l = x[is_snv]

# Feature matrix for the MVP model. 'CPDID' is not among the columns selected
# when the data is loaded, so errors='ignore' keeps this from raising KeyError
# while still dropping the column if it is ever present.
s2 = l.drop(columns=['CATEGORIZATION', 'CPDID'], errors='ignore')
s2 = s2.fillna(-999)

In [11]:
# From here on we iterate over one PUBLIC resource at a time: each frame holds
# the variant identifiers (GENE, PCHANGE) plus the columns of a single resource.
# MVP is built from the SNV-only matrix `s2`; all others come from `s`.
gnomadS = s.loc[:, ['GENE', 'PCHANGE', 'GNOMAD_AF', 'GNOMAD_AC']]
cosmicS = s.loc[:, ['GENE', 'PCHANGE', 'COSMIC_CNT']]
pmidS = s.loc[:, ['GENE', 'PCHANGE', 'PMID_COUNT']]
mvpS = s2.loc[:, ['GENE', 'PCHANGE', 'MVP_score']]
clinvarS = s.loc[:, ['GENE', 'PCHANGE', 'ClinicalSignificance']]
keggS = s.loc[:, ['GENE', 'PCHANGE', 'KEGG']]
civicS = s.loc[:, ['GENE', 'PCHANGE', 'Civic_Evidence', 'Civic_Drug']]

In [13]:
# Load the pre-trained single-resource models and the full model
# ('azurify.json') from their JSON exports. load_model() returns the
# classifier it was called on, so construction and loading can be chained.
gnomadM = CatBoostClassifier().load_model('gnomad_only.json', format='json')
cosmicM = CatBoostClassifier().load_model('cosmic_only.json', format='json')
pmidM = CatBoostClassifier().load_model('pmid_only.json', format='json')
mvpM = CatBoostClassifier().load_model('mvp_only.json', format='json')
clinvarM = CatBoostClassifier().load_model('clinvar_only.json', format='json')
keggM = CatBoostClassifier().load_model('kegg_only.json', format='json')
civicM = CatBoostClassifier().load_model('civic_only.json', format='json')
compM = CatBoostClassifier().load_model('azurify.json', format='json')
compM

<catboost.core.CatBoostClassifier at 0x20cf3ce2e10>

In [14]:
# Per-class probabilities from each single-resource model on its own feature
# subset. MVP is scored on the SNV-only rows.
gnomad_y_score = gnomadM.predict_proba(gnomadS)
cosmic_y_score = cosmicM.predict_proba(cosmicS)
pmid_y_score = pmidM.predict_proba(pmidS)
mvp_y_score = mvpM.predict_proba(mvpS)
clinvar_y_score = clinvarM.predict_proba(clinvarS)
kegg_y_score = keggM.predict_proba(keggS)
civic_y_score = civicM.predict_proba(civicS)

# The complete model sees the full feature matrix.
comp_y_score = compM.predict_proba(s)

In [None]:
def multi_roc(n, truth, y_score, classes=None):
    """Compute one-vs-rest ROC curves and AUCs for the first ``n`` classes.

    Parameters
    ----------
    n : int
        Number of classes (leading columns of ``y_score``) to evaluate.
    truth : array-like of shape (n_samples,)
        True class label of every sample, row-aligned with ``y_score``.
    y_score : array-like of shape (n_samples, n_columns)
        Per-class scores; column ``i`` is evaluated against ``classes[i]``.
    classes : array-like, optional
        Label order matching the columns of ``y_score``. Defaults to the
        sorted unique labels found in ``truth``, so the column alignment no
        longer depends on the order in which labels first appear in the rows.
        NOTE(review): pass the fitted model's own class order here if it is
        not the sorted label order.

    Returns
    -------
    fpr, tpr, roc_auc : dict
        Dictionaries keyed by class index ``0 .. n-1`` holding the false
        positive rates, true positive rates and the area under each curve.

    Raises
    ------
    ValueError
        If ``y_score`` is not 2-D, or ``n`` exceeds the number of known
        classes, score columns, or binarized label columns.
    """
    y_test = np.asarray(truth)
    y_score = np.asarray(y_score)
    class_order = np.unique(y_test) if classes is None else np.asarray(classes)

    # Validate up front so a mismatch gives a clear message instead of an
    # IndexError from deep inside the loop.
    if y_score.ndim != 2:
        raise ValueError("y_score must be 2-D (n_samples, n_classes)")
    if n > len(class_order):
        raise ValueError(
            "n=%d exceeds the %d known classes" % (n, len(class_order))
        )
    if n > y_score.shape[1]:
        raise ValueError(
            "n=%d exceeds the %d score columns" % (n, y_score.shape[1])
        )

    y_test_bin = label_binarize(y_test, classes=class_order)
    # label_binarize collapses two-class problems to a single column.
    if n > y_test_bin.shape[1]:
        raise ValueError(
            "n=%d exceeds the %d binarized label columns"
            % (n, y_test_bin.shape[1])
        )

    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    return fpr, tpr, roc_auc

In [22]:
# One-vs-rest ROC curves per model. Every model is scored against the labels
# of the balanced sample `x`, except MVP, which is scored against the
# SNV-only rows `l` that its predictions were made on.
gfpr, gtpr, gra = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=gnomad_y_score)
cfpr, ctpr, cra = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=cosmic_y_score)
pfpr, ptpr, pra = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=pmid_y_score)
mfpr, mtpr, mra = multi_roc(n=5, truth=l['CATEGORIZATION'], y_score=mvp_y_score)
cvfpr, cvtpr, cvra = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=clinvar_y_score)
kfpr, ktpr, kra = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=kegg_y_score)
cifpr, citpr, cira = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=civic_y_score)
afpr, atpr, ara = multi_roc(n=5, truth=x['CATEGORIZATION'], y_score=comp_y_score)


In [23]:
# Figure defaults for the exported plots: Type 42 (TrueType) fonts in PDF/PS
# output, Arial at 16 pt, and an 8x6 inch canvas.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Arial',
    'figure.figsize': (8, 6),
    'font.size': 16,
})


# Display names for the five categories.
# NOTE(review): assumes index i of every fpr/tpr/auc dict refers to
# classes[i] -- confirm against the label order used when computing the curves.
classes = ['Benign', 'Pathogenic', 'Likely Benign', 'Likely Pathogenic', 'VUS']

# (legend name, fpr dict, tpr dict, auc dict, extra plot kwargs) per model.
# The colour is passed to plot() here; previously it was handed to
# str.format() and silently ignored.
roc_sources = [
    ('Azurify', afpr, atpr, ara, {'color': '#011F5B'}),
    ('Gnomad', gfpr, gtpr, gra, {}),
    ('MVP', mfpr, mtpr, mra, {}),
    ('COSMIC', cfpr, ctpr, cra, {}),
    ('PubMed Count', pfpr, ptpr, pra, {}),
    ('ClinVar', cvfpr, cvtpr, cvra, {}),
    ('KEGG', kfpr, ktpr, kra, {}),
    ('CiVic', cifpr, citpr, cira, {}),
]

# One figure per category, overlaying every model's one-vs-rest ROC curve.
for i, class_name in enumerate(classes):
    fig, ax = plt.subplots()
    for source, fpr, tpr, roc_auc, style in roc_sources:
        # The reported number is the area under the ROC curve, so the legend
        # says AUC (it used to be mislabelled as AP).
        ax.plot(
            fpr[i],
            tpr[i],
            label='{} (AUC = {:0.2f})'.format(source, roc_auc[i]),
            **style,
        )
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('One-vs-Rest ROC curves: ' + class_name)
    ax.legend(loc='lower right')
    fig.savefig(class_name + '.OvR.pdf', dpi=600)
    # Release the figure so looping does not accumulate open figures.
    plt.close(fig)
