In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(7)

In [2]:
def bootstrap_auc(y_true, y_pred, n=1000, sample_size=645, max_tries=10000):
    """
    Bootstrap the ROC AUC of `y_pred` against `y_true`.

    Repeatedly draws `sample_size` items with replacement, scores each
    resample with `roc_auc_score`, and summarises the collected AUCs with
    a two-sided 95% percentile interval.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, aligned element-wise with `y_true`.
    n : int
        Number of bootstrap AUCs to collect.
    sample_size : int or None
        Number of items drawn per resample; `None` means `len(y_pred)`.
        NOTE(review): the default of 645 is a hard-coded constant whose
        origin is not visible in this notebook -- confirm it is intended
        for every dataset it is applied to.
    max_tries : int
        Upper bound on resampling attempts. Resamples that contain a
        single class are skipped (AUC is undefined for them), so more
        than `n` attempts may be needed.

    Returns
    -------
    tuple
        `(mean_auc, half_width)`: the mean of the bootstrap AUCs and half
        the distance between the lower and upper 95% percentile AUCs.

    Raises
    ------
    ValueError
        If no bootstrap AUC could be computed within `max_tries` attempts.
    """
    # Convert once; the resampling loop then uses plain fancy indexing
    # instead of rebuilding the arrays for every drawn index.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if sample_size is None:
        sample_size = len(y_pred)

    simulations = []
    num_tried = 0
    while len(simulations) < n and num_tried < max_tries:
        num_tried += 1
        iteridx = np.random.choice(len(y_pred), size=sample_size, replace=True)
        iterytrue = y_true[iteridx]
        # AUC needs both classes present in the resample.
        if np.unique(iterytrue).size < 2:
            continue
        simulations.append(roc_auc_score(iterytrue, y_pred[iteridx]))

    if not simulations:
        raise ValueError(
            "no bootstrap AUC could be computed "
            "(n={}, attempts={})".format(n, num_tried)
        )
    simulations.sort()

    def ci(p):
        """
        Return (mean, half-width) of the 2-sided symmetric interval
        specified by p.
        """
        # Index by the number of AUCs actually collected: the loop may stop
        # short of `n` when the attempt cap is reached.
        n_sims = len(simulations)
        u_pval = (1+p)/2.
        l_pval = (1-u_pval)
        # Clamp so the upper index stays in range (e.g. p == 1).
        l_indx = min(int(np.floor(n_sims*l_pval)), n_sims - 1)
        u_indx = min(int(np.floor(n_sims*u_pval)), n_sims - 1)
        return (np.mean(simulations), np.abs(simulations[l_indx]-simulations[u_indx])/2)

    return ci(.95)
    

# Calculating AUC per split

In [3]:
# Per-split evaluation: for each of the five training runs, score every model
# on its own test set and print the study-level ROC AUC.
splits = []  # NOTE(review): never filled or read in the visible cells; kept in case other cells use it.

# The labelled test sets do not depend on the split, so read them once
# instead of on every loop iteration.
cxr_csv = pd.read_csv('test_dfs/cxr_test_df.csv')
nih_csv = pd.read_csv('test_dfs/nih_test_df.csv')
cxp_csv = pd.read_csv('test_dfs/cxp_test_df.csv')

for i in [1,2,3,4,5]:
    print("Split {}".format(i))

    # Predictions of run `i`, keyed by image path and joined to the labels.
    cxr_df = (
        pd.read_csv('model_files/cxr_run_{}/cxr_test_result.csv'.format(i))[['Path', 'Pneumothorax']]
        .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
        .merge(cxr_csv, on=['Path'])
    )

    nih_df = (
        pd.read_csv('model_files/nih_run_{}/nih_test_result.csv'.format(i))[['Path', 'Pneumothorax']]
        .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
        .merge(nih_csv, on=['Path'])
    )
    # NIH study key: "<Patient ID>/<Follow-up #>".
    nih_df['study_id'] = nih_df['Patient ID'].astype(str) + '/' + nih_df['Follow-up #'].astype(str)

    cxp_df = (
        pd.read_csv('model_files/cxp_run_{}/cxp_test_result.csv'.format(i))[['Path', 'Pneumothorax']]
        .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
        .merge(cxp_csv, on=['Path'])
    )
    # CXP study key: the path segment between 'chexpert_full/train/' and '/view'.
    cxp_df['study_id'] = cxp_df['Path'].map(
        lambda path: path.split('chexpert_full/train/')[1].split('/view')[0]
    )

    # Collapse images to studies with max(), then score each dataset.
    score_cols = ['Pneumothorax', 'Pneumothorax_pred']

    cxr = cxr_df.groupby('study_id')[score_cols].max()
    auc = roc_auc_score(cxr['Pneumothorax'].values, cxr['Pneumothorax_pred'].values)
    print('CXR: {}'.format(auc))

    cxp = cxp_df.groupby('study_id')[score_cols].max()
    auc = roc_auc_score(cxp['Pneumothorax'].values, cxp['Pneumothorax_pred'].values)
    print('CXP: {}'.format(auc))

    nih = nih_df.groupby('study_id')[score_cols].max()
    auc = roc_auc_score(nih['Pneumothorax'].values, nih['Pneumothorax_pred'].values)
    print('NIH: {}'.format(auc))
    
    

Split 1
CXR: 0.864419814351297
CXP: 0.891499280680008
NIH: 0.8579875047033739
Split 2
CXR: 0.8764873350427249
CXP: 0.8856284955508448
NIH: 0.8711152091433588
Split 3
CXR: 0.8741205857933095
CXP: 0.8890909060288147
NIH: 0.8657632595948828
Split 4
CXR: 0.8870090185635829
CXP: 0.8856849520652942
NIH: 0.865126442367992
Split 5
CXR: 0.8802113024492033
CXP: 0.8903737245356232
NIH: 0.8671685885174966


# Preprocessing study level scores

In [4]:
# For each model, average its five runs' per-image predictions on its own
# test set, then join the averaged scores to the labelled test CSV.

# --- CXR ---
cxr_dfs = [
    pd.read_csv('model_files/cxr_run_{}/cxr_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
cxr_df = (
    pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
)
cxr_test_csv = pd.read_csv('test_dfs/cxr_test_df.csv')
cxr_df = cxr_df.merge(cxr_test_csv, on=['Path'])

# --- NIH ---
nih_dfs = [
    pd.read_csv('model_files/nih_run_{}/nih_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
nih_df = (
    pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
)
nih_test_csv = pd.read_csv('test_dfs/nih_test_df.csv')
nih_df = nih_df.merge(nih_test_csv, on=['Path'])
# NIH study key: "<Patient ID>/<Follow-up #>", built column-wise rather than row by row.
nih_df['study_id'] = nih_df['Patient ID'].astype(str) + '/' + nih_df['Follow-up #'].astype(str)

# --- CXP ---
cxp_dfs = [
    pd.read_csv('model_files/cxp_run_{}/cxp_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
cxp_df = (
    pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
)
cxp_test_csv = pd.read_csv('test_dfs/cxp_test_df.csv')
cxp_df = cxp_df.merge(cxp_test_csv, on=['Path'])
# CXP study key: the path segment between 'chexpert_full/train/' and '/view'.
cxp_df['study_id'] = cxp_df['Path'].map(
    lambda path: path.split('chexpert_full/train/')[1].split('/view')[0]
)


In [5]:
# Study-level CXR scores: average label and prediction over the images of each study.
# Only the numeric columns are aggregated; taking mean() over the string 'Path'
# column raises TypeError on pandas >= 2.0.
# NOTE(review): the per-split cell collapses studies with max(), this one with mean() -- confirm intended.
cxr = cxr_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXR auc: {}'.format(bootstrap_auc(cxr['Pneumothorax'].values, cxr['Pneumothorax_pred'].values)))
gc.collect()

CXR auc: (0.8926802371302883, 0.05923445424286372)


20

In [6]:
# Study-level CXP scores: average label and prediction over the images of each study.
# Only the numeric columns are aggregated; taking mean() over the string 'Path'
# column raises TypeError on pandas >= 2.0.
cxp = cxp_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXP auc: {}'.format(bootstrap_auc(cxp['Pneumothorax'].values, cxp['Pneumothorax_pred'].values)))
gc.collect()

CXP auc: (0.9045383472881069, 0.047747722041507346)


0

In [7]:
# Study-level NIH scores: average label and prediction over the images of each study.
# Only the numeric columns are aggregated; taking mean() over the string 'Path'
# column raises TypeError on pandas >= 2.0.
nih = nih_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('NIH auc: {}'.format(bootstrap_auc(nih['Pneumothorax'].values, nih['Pneumothorax_pred'].values)))
gc.collect()

NIH auc: (0.8853182174067162, 0.06215625331532337)


0

In [8]:
# CXR ON NIH
# Average the five CXR runs' predictions on the NIH test images, then join the NIH labels.
cxr_dfs = [
    pd.read_csv('model_files/cxr_run_{}/nih_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
cxr_df = (
    pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(nih_test_csv, on=['Path'])
)
# NIH study key: "<Patient ID>/<Follow-up #>".
cxr_df['study_id'] = cxr_df['Patient ID'].astype(str) + '/' + cxr_df['Follow-up #'].astype(str)

# Aggregate only the numeric columns: mean() over the string 'Path' column
# raises TypeError on pandas >= 2.0.
cxr = cxr_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXR on NIH auc: {}'.format(bootstrap_auc(cxr['Pneumothorax'].values, cxr['Pneumothorax_pred'].values)))
gc.collect()

# CXR ON CXP
# Same procedure against the CXP test images and labels.
cxr_dfs = [
    pd.read_csv('model_files/cxr_run_{}/cxp_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
cxr_df = (
    pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(cxp_test_csv, on=['Path'])
)
# CXP study key: the path segment between 'chexpert_full/train/' and '/view'.
cxr_df['study_id'] = cxr_df['Path'].map(
    lambda path: path.split('chexpert_full/train/')[1].split('/view')[0]
)

cxr = cxr_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXR on CXP auc: {}'.format(bootstrap_auc(cxr['Pneumothorax'].values, cxr['Pneumothorax_pred'].values)))


CXR on NIH auc: (0.8381883132047001, 0.08997141791972163)
CXR on CXP auc: (0.8250992386155683, 0.06709021819327676)


In [9]:
# NIH ON CXP
# Average the NIH runs' predictions on the CXP test images, then join the CXP labels.
# NOTE(review): only runs 2-5 are ensembled here, while every other cell uses runs 1-5.
# The reason is not visible in this notebook -- confirm whether run 1 is excluded on purpose.
nih_dfs = [
    pd.read_csv('model_files/nih_run_{}/cxp_test_result.csv'.format(i))
    for i in [2,3,4,5]
]
nih_df = (
    pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(cxp_test_csv, on=['Path'])
)
# CXP study key: the path segment between 'chexpert_full/train/' and '/view'.
nih_df['study_id'] = nih_df['Path'].map(
    lambda path: path.split('chexpert_full/train/')[1].split('/view')[0]
)

# Aggregate only the numeric columns: mean() over the string 'Path' column
# raises TypeError on pandas >= 2.0.
nih = nih_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('NIH on CXP auc: {}'.format(bootstrap_auc(nih['Pneumothorax'].values, nih['Pneumothorax_pred'].values)))

gc.collect()
# NIH ON CXR
# Same procedure against the CXR test set (again runs 2-5 only -- see the note above).
nih_dfs = [
    pd.read_csv('model_files/nih_run_{}/cxr_test_result.csv'.format(i))
    for i in [2,3,4,5]
]
# No study_id is built here: it is taken from the merged CXR test CSV.
nih_df = (
    pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(cxr_test_csv, on=['Path'])
)

nih = nih_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('NIH on CXR auc:  {}'.format(bootstrap_auc(nih['Pneumothorax'].values, nih['Pneumothorax_pred'].values)))


NIH on CXP auc: (0.78150872288462, 0.07182757914328475)
NIH on CXR auc:  (0.767338063461262, 0.09447236505491546)


In [10]:
#CXP ON CXR
# Average the five CXP runs' predictions on the CXR test images, then join the CXR labels.
cxp_dfs = [
    pd.read_csv('model_files/cxp_run_{}/cxr_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
# No study_id is built here: it is taken from the merged CXR test CSV.
cxp_df = (
    pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(cxr_test_csv, on=['Path'])
)

# Aggregate only the numeric columns: mean() over the string 'Path' column
# raises TypeError on pandas >= 2.0.
cxp = cxp_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXP on CXR auc: {}'.format(bootstrap_auc(cxp['Pneumothorax'].values, cxp['Pneumothorax_pred'].values)))

gc.collect()
#CXP on NIH
# Same procedure against the NIH test images and labels.
cxp_dfs = [
    pd.read_csv('model_files/cxp_run_{}/nih_test_result.csv'.format(i))
    for i in [1,2,3,4,5]
]
cxp_df = (
    pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
    .rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
    .groupby('Path')
    .mean()
    .merge(nih_test_csv, on=['Path'])
)
# NIH study key: "<Patient ID>/<Follow-up #>".
cxp_df['study_id'] = cxp_df['Patient ID'].astype(str) + '/' + cxp_df['Follow-up #'].astype(str)

cxp = cxp_df.groupby('study_id')[['Pneumothorax', 'Pneumothorax_pred']].mean()
print('CXP on NIH auc: {}'.format(bootstrap_auc(cxp['Pneumothorax'].values, cxp['Pneumothorax_pred'].values)))



CXP on CXR auc: (0.8690971493762342, 0.07759713408262608)
CXP on NIH auc: (0.8537688410155057, 0.08230548983937658)


In [11]:
# Arithmetic mean of nine hand-entered constants.
# NOTE(review): these look like the interval half-widths printed by the cells above,
# rounded to three decimals -- confirm. The in-domain CXR half-width printed above is
# 0.0592, but 0.060 is entered here. The values will go stale if any cell is re-run.
(0.048+0.067+0.072+0.078+0.060+0.094+0.082+0.090+0.0620)/9

0.07255555555555555