In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score, auc, precision_recall_curve, average_precision_score
from math import sqrt
import pickle

In [2]:
data = pd.read_csv(r'/home/kchen/Documents/ureterinjury/procol_train.csv', index_col='CASEID')

In [3]:
include = ['CaseID', 'SEX', 'RACE_NEW', 'ETHNICITY_HISPANIC', 'PRNCPTX', 'CPT', 'WORKRVU', 'Age', 'ANESTHES', 'ANETIME', 'SURGSPEC', 'ELECTSURG', 'EMERGNCY', 'WNDCLAS', 'ASACLAS', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 'HEIGHT', 'WEIGHT', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'DOpertoD', 'OUPNEUMO', 'CDARREST', 'CDMI', 'SUPINFEC', 'WNDINFD', 'ORGSPCSSI', 'URNINFEC', 'OTHDVT', 'PULEMBOL', 'RENAINSF', 'OPRENAFL', 'DEHIS', 'REINTUB', 'FAILWEAN', 'CNSCVA', 'OTHSYSEP', 'OTHBLEED', 'OTHSESHOCK', 'SEPSHOCKPATOS', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2', 'CONCPT3', 'CONCPT4', 'CONCPT5', 'CONCPT6', 'CONCPT7', 'CONCPT8', 'CONCPT9', 'CONCPT10', 'INOUT', 'TRANST', 'DPRNA', 'DPRBUN', 'DPRCREAT', 'DPRALBUM', 'DPRBILI', 'DPRSGOT', 'DPRALKPH', 'DPRWBC', 'DPRHCT', 'DPRPLATE', 'DPRPTT', 'DPRPT', 'DPRINR', 'OperYR', 'OPTIME', 'HtoODay', 'SSSIPATOS', 'DSSIPATOS', 'OSSIPATOS', 'PNAPATOS', 'VENTPATOS', 'UTIPATOS', 'SEPSISPATOS', 'SEPSHOCKPATOS', 'OPTIME', 'DRENAINSF','DOPRENAFL','DISCHDEST','READMISSION1','READMPODAYS1', 'READMSUSPREASON1', 'READMUNRELSUSP1', 'READMRELICD91', 'READMRELICD101', 'READMUNRELICD91', 'READMUNRELICD101','REOPERATION1','RETURNOR','TOTHLOS','DEHIS','STILLINHOSP','OTHCDIFF']

In [5]:
y = data['URETER']
X = data.drop(['URETER'], axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [6]:
def evaluate(model1, X, y):
    ppreds = model1.predict_proba(X)
    ppreds = ppreds[:,1]
    pscore = roc_auc_score(y, ppreds)
    print('AUC', pscore)

In [7]:
model3 = LogisticRegression(penalty='none', max_iter=1000)
model3.fit(X_train, y_train)
evaluate(model3, X_valid, y_valid)

AUC 0.7024442961953608


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
lr = 'lr.sav'
pickle.dump(model3, open(lr, 'wb'))

In [8]:
test = pd.read_csv(r'/home/kchen/Documents/ureterinjury/procol_test.csv', index_col='CASEID')


In [9]:
y_test = test['URETER']
X_test = test.drop(['URETER'], axis=1)

In [10]:
evaluate(model3, X_test, y_test)

AUC 0.7208118808666005


In [11]:
lr_preds = (model3.predict_proba(X_test))[:,1]
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_preds)
%store lr_fpr
%store lr_tpr

Stored 'lr_fpr' (ndarray)
Stored 'lr_tpr' (ndarray)


In [12]:
lr_prec, lr_rec, _ = precision_recall_curve(y_test, lr_preds)
%store lr_prec
%store lr_rec

Stored 'lr_prec' (ndarray)
Stored 'lr_rec' (ndarray)


In [13]:
def roc_auc_ci(y_true, y_score, positive=1):
    AUC = roc_auc_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, AUC, upper)
roc_auc_ci(y_test, lr_preds)

(0.7084795116598942, 0.740818989792998, 0.7731584679261018)

In [14]:
def roc_prc_ci(y_true, y_score, positive=1):
    AUC = average_precision_score(y_true, y_score)
    N1 = sum(y_true == positive)
    N2 = sum(y_true != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    lower = AUC - 1.96*SE_AUC
    upper = AUC + 1.96*SE_AUC
    if lower < 0:
        lower = 0
    if upper > 1:
        upper = 1
    return (lower, AUC, upper)
roc_prc_ci(y_test, lr_preds)

(0.01909561312532762, 0.021649154469852334, 0.024202695814377048)

In [62]:
lrpreds = model3.predict_proba(X_test)
lrpreds10 = lrpreds[:,1]

In [63]:
lrpreds10[lrpreds10 >= 0.006] = 1
lrpreds10[lrpreds10 < 0.006] = 0
tn, fp, fn, tp = confusion_matrix(y_test, lrpreds10).ravel()
(tn, fp, fn, tp)


(35153, 15796, 106, 195)

In [64]:
print("sensitivity = ")
print(tp / (tp + fn))
print("specificity = ")
print(tn / (tn + fp))
print('accuracy = ')
accuracy_score(y_test, lrpreds10)

sensitivity = 
0.6478405315614618
specificity = 
0.6899644742781997
accuracy = 


0.6897170731707317

In [31]:
accuracy_score(y_test, lrpreds10)


0.9937560975609756

In [32]:
lrpred = lr.predict(X_test)

In [33]:
accuracy_score(y_test, lrpred)


0.9941268292682927

In [34]:
lrpreds10[lrpreds10 >= 0.08] = 1
lrpreds10[lrpreds10 < 0.08] = 0
tn, fp, fn, tp = confusion_matrix(y_test, lrpreds10).ravel()
(tn, fp, fn, tp)

(50928, 21, 299, 2)