In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score, auc, precision_recall_curve, average_precision_score
from math import sqrt

In [2]:
data = pd.read_feather(r'/media/kchen/2TB/kchen_backup/ssi/data/procol_train.feather')
y = data['ssi']
X = data.drop(['SUPINFEC','WNDINFD','ORGSPCSSI','ssi'], axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

test = pd.read_feather(r'/media/kchen/2TB/kchen_backup/ssi/data/procol_test.feather')
y_test = test['ssi']
X_test = test.drop(['SUPINFEC','WNDINFD','ORGSPCSSI','ssi'], axis=1)

In [3]:
def evaluate(model1, X, y):
    ppreds = model1.predict_proba(X)
    ppreds = ppreds[:,1]
    pscore = roc_auc_score(y, ppreds)
    print('AUC', pscore)

In [6]:
#implement a standard logistic regression model without regularization
lr = LogisticRegression(penalty='none')
lr.fit(X_train, y_train)
evaluate(lr, X_valid, y_valid)

AUC 0.7337101645432886


In [7]:
evaluate(lr, X_test, y_test)

AUC 0.7450438051972218


In [6]:
lr_preds = (lr.predict_proba(X_test))[:,1]
lr_fpr_ssi, lr_tpr_ssi, _ = roc_curve(y_test, lr_preds)
%store lr_fpr_ssi
%store lr_tpr_ssi

Stored 'lr_fpr_ssi' (ndarray)
Stored 'lr_tpr_ssi' (ndarray)


In [7]:
def roc_auc_ci(y_true, y_score, positive=1):
    AUC = roc_auc_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, AUC, upper)
roc_auc_ci(y_test, lr_preds)

(0.6689783250938898, 0.6771987409946261, 0.6854191568953624)

In [8]:
lr_prec_ssi, lr_rec_ssi, _ = precision_recall_curve(y_test, lr_preds)
%store lr_prec_ssi
%store lr_rec_ssi

Stored 'lr_prec_ssi' (ndarray)
Stored 'lr_rec_ssi' (ndarray)


In [9]:
def roc_prc_ci(y_true, y_score, positive=1):
    AUC = average_precision_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, AUC, upper)
roc_prc_ci(y_test, lr_preds)

(0.18958663836343612, 0.19449505484764085, 0.19940347133184558)

In [10]:
lrpreds = lr.predict_proba(X_test)
lrpreds = lrpreds[:,1]
%store lrpreds

Stored 'lrpreds' (ndarray)


In [11]:
%store -r lrpreds

In [12]:
from sklearn.metrics import recall_score
from imblearn.metrics import specificity_score
thresh = np.arange(0, 1, 0.0005)
%store -r ann_preds
#calculate sensitivity at thresholds
lr_sens = {}
for t in thresh:
    lr_sens[t] = recall_score(y_test, lr_preds > t)
lr_spec = {}
for t in thresh:
    lr_spec[t] = specificity_score(y_test, lr_preds > t)
ann_sens = {}
for t in thresh:
    ann_sens[t] = recall_score(y_test, ann_preds > t)
ann_spec = {}
for t in thresh:
    ann_spec[t] = specificity_score(y_test, ann_preds > t)
def get_senspec(thresh):
    print(lr_sens[thresh], lr_spec[thresh])
    print(ann_sens[thresh], ann_spec[thresh])


In [13]:
get_senspec(0.169)

0.25721198585520194 0.9005819909758702
0.36795086543830263 0.9543126185234431


In [14]:
get_senspec(0.145)

0.3625535082821515 0.8421431218257515
0.45151684347664245 0.9050286636004969


In [15]:
get_senspec(0.111)

0.562069607295738 0.7000675719859625
0.5963149078726968 0.7853390587876278


In [16]:
get_senspec(0.0940)

0.6774613809789689 0.5906663469712492
0.681928159315094 0.6989341064149792


In [17]:
get_senspec(0.0830)

0.7500465289410013 0.49652331233515706
0.7373906569886469 0.629792706584999


In [18]:
get_senspec(0.0665)

0.8648799553322166 0.3157791485929769
0.8196538246789503 0.4992261917736556


In [19]:
get_senspec(0.0655)

0.8717662386004095 0.30246092813392333
0.8265401079471432 0.4902238594502692


In [20]:
get_senspec(0.0460)

0.983249581239531 0.054515334481330516
0.9134561697375768 0.29757830721276457


In [21]:
get_senspec(0.0505)

0.9672436255350828 0.1016849401660963
0.89428624604504 0.34766876648429496


In [22]:
get_senspec(0.0285)

1.0 0.0
0.9739437930392705 0.10334154369291802
