In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import gc
np.random.seed(7)


In [2]:
def bootstrap_auc(y_true, y_pred, n=1000, sample_size=None):
    """
    Bootstrap the ROC AUC of `y_pred` against `y_true`.

    Up to `n` resamples of `sample_size` paired observations are drawn with
    replacement. A resample that contains only one class is skipped, and
    the loop stops after 10000 attempts, so fewer than `n` AUC values may
    be collected.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, aligned with `y_true`.
    n : int
        Number of bootstrap AUC values to collect.
    sample_size : int or None
        Size of each resample; defaults to `len(y_pred)`.

    Returns
    -------
    tuple
        (mean of the bootstrap AUCs, half the distance between the lower
        and upper order statistics of the two-sided 95% interval).

    Raises
    ------
    ValueError
        If no resample contained both classes.
    """
    # Convert once up front; indexing the arrays with the whole index vector
    # replaces the per-element np.array(...) rebuild of the original.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if sample_size is None:
        sample_size = len(y_pred)

    simulations = []
    num_tried = 0
    while len(simulations) < n and num_tried < 10000:
        num_tried += 1
        iteridx = np.random.choice(len(y_pred), size=sample_size, replace=True)
        iterytrue = y_true[iteridx]
        if len(np.unique(iterytrue)) < 2:
            # Only one class was drawn: skip this resample and try again.
            continue
        simulations.append(roc_auc_score(iterytrue, y_pred[iteridx]))
    if not simulations:
        raise ValueError('no bootstrap resample contained both classes')
    simulations.sort()

    def ci(p):
        """
        Return (mean, half-width) of the 2-sided symmetric interval
        specified by p.
        """
        # Index by the number of AUCs actually collected, which can be
        # smaller than the requested `n`.
        n_sims = len(simulations)
        u_pval = (1 + p) / 2.
        l_pval = (1 - u_pval)
        l_indx = int(np.floor(n_sims * l_pval))
        # Clamp so that p == 1 selects the largest value instead of overflowing.
        u_indx = min(int(np.floor(n_sims * u_pval)), n_sims - 1)
        return (np.mean(simulations), np.abs(simulations[l_indx] - simulations[u_indx]) / 2)

    return ci(.95)
    
def ci(simulations, p, n):
    """
    Return (mean, half-width) of the 2-sided symmetric interval specified by p.

    Parameters
    ----------
    simulations : sequence of float
        Simulated statistics (e.g. bootstrap AUCs); left unmodified.
    p : float
        Coverage of the interval, e.g. .95.
    n : int
        Number of simulations used to locate the order statistics;
        callers pass `len(simulations)`.

    Returns
    -------
    tuple
        (mean of `simulations`, half the absolute difference between the
        lower and upper order statistics).

    Raises
    ------
    ValueError
        If `simulations` is empty.
    """
    if len(simulations) == 0:
        raise ValueError('simulations must not be empty')
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(simulations)
    last = len(ordered) - 1
    u_pval = (1 + p) / 2.
    l_pval = (1 - u_pval)
    # Clamp both indices: floor(n * u_pval) equals n when p == 1, and n may
    # exceed the number of simulations actually supplied.
    l_indx = min(int(np.floor(n * l_pval)), last)
    u_indx = min(int(np.floor(n * u_pval)), last)
    return (np.mean(ordered), np.abs(ordered[l_indx] - ordered[u_indx]) / 2)

## Preprocessing the prediction DataFrames

In [3]:
# --- CXR model on the CXR test set ---
# Read the five runs, average the Pneumothorax score per image Path, then
# join the test-set CSV on Path.
cxr_dfs = [pd.read_csv('model_files/cxr_run_{}/cxr_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxr_df = pd.concat(cxr_dfs)[['Path', 'Pneumothorax']].rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
cxr_df = cxr_df.groupby('Path').mean()
cxr_test_csv = pd.read_csv('test_dfs/cxr_test_df.csv')
cxr_df = cxr_df.merge(cxr_test_csv, on='Path')

# --- NIH model on the NIH test set ---
nih_dfs = [pd.read_csv('model_files/nih_run_{}/nih_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
nih_df = pd.concat(nih_dfs)[['Path', 'Pneumothorax']].rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
nih_df = nih_df.groupby('Path').mean()
nih_test_csv = pd.read_csv('test_dfs/nih_test_df.csv')
nih_df = nih_df.merge(nih_test_csv, on='Path')
# Study key is "<Patient ID>/<Follow-up #>".
nih_df['study_id'] = nih_df.apply(lambda row: '{}/{}'.format(str(row['Patient ID']), str(row['Follow-up #'])), axis=1)

# --- CXP model on the CXP test set ---
cxp_dfs = [pd.read_csv('model_files/cxp_run_{}/cxp_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxp_df = pd.concat(cxp_dfs)[['Path', 'Pneumothorax']].rename(columns={'Pneumothorax': 'Pneumothorax_pred'})
cxp_df = cxp_df.groupby('Path').mean()
cxp_test_csv = pd.read_csv('test_dfs/cxp_test_df.csv')
cxp_df = cxp_df.merge(cxp_test_csv, on='Path')
# Study key is the part of Path between 'chexpert_full/train/' and '/view'.
cxp_df['study_id'] = cxp_df['Path'].apply(lambda path: path.split('chexpert_full/train/')[1].split('/view')[0])

# --- Attach CXR demographics ---
# NOTE(review): this cell reads 'cxr_demo_df.csv' from the working directory,
# while later cells read 'test_dfs/cxr_demo_df.csv' -- confirm both exist and match.
demo_df = pd.read_csv('cxr_demo_df.csv')
cxr_combo_df = cxr_df.merge(demo_df, on='subject_id', how='inner')
cxr_combo_df['ethnicity'] = cxr_combo_df['ethnicity'].apply(
    lambda value: 'OTHER' if value in ['AMERICAN INDIAN/ALASKA NATIVE', 'UNKNOWN', 'UNABLE TO OBTAIN'] else value
)
cxr_combo_df['insurance'] = cxr_combo_df['insurance'].apply(lambda value: 'Other' if value == 'Medicare' else value)
# Drop rows whose anchor_age is exactly 0.
cxr_df = cxr_combo_df.loc[cxr_combo_df['anchor_age'] != 0]

In [4]:
# Marginal age and sex fractions of the CXR frame.
# Age bins are (low, high]: exclusive lower edge, inclusive upper edge.
age_intervals = [0, 20, 40, 60, 80]
age_field = 'anchor_age'
age_labels = ['0-20', '21-40', '41-60', '61-80']
cxr_age = {
    label: np.mean((cxr_df[age_field] > low) & (cxr_df[age_field] <= high))
    for label, low, high in zip(age_labels, age_intervals[:-1], age_intervals[1:])
}
# Open-ended top bin; inserted last so the key order stays 0-20 ... 80+.
cxr_age['80+'] = np.mean(cxr_df[age_field] > age_intervals[-1])

sex_field = 'gender'
cxr_sex = {code: np.mean(cxr_df[sex_field] == code) for code in ['M', 'F']}

In [5]:
# Marginal age and sex fractions of the CXP frame.
# Age bins are (low, high]: exclusive lower edge, inclusive upper edge, so a
# row with Age == 0 falls in no bin.
age_intervals = [0, 20, 40, 60, 80]
age_field = 'Age'
age_labels = ['0-20', '21-40', '41-60', '61-80']
cxp_age = {
    label: np.mean((cxp_df[age_field] > low) & (cxp_df[age_field] <= high))
    for label, low, high in zip(age_labels, age_intervals[:-1], age_intervals[1:])
}
# Open-ended top bin; inserted last so the key order stays 0-20 ... 80+.
cxp_age['80+'] = np.mean(cxp_df[age_field] > age_intervals[-1])

sex_field = 'Sex'
# Keys use the short 'M'/'F' codes; the CXP column stores 'Male'/'Female'.
cxp_sex = {code: np.mean(cxp_df[sex_field] == value) for code, value in [('M', 'Male'), ('F', 'Female')]}

In [6]:
# Marginal age and sex fractions of the NIH frame.
age_intervals = [0, 20, 40, 60, 80]
age_field = 'Patient Age'
# Treat an age of 0 as missing and drop those rows before computing fractions.
nih_df[age_field] = nih_df[age_field].replace(0, np.nan)
nih_df = nih_df[nih_df[age_field].notnull()]
# Age bins are (low, high]: exclusive lower edge, inclusive upper edge.
age_labels = ['0-20', '21-40', '41-60', '61-80']
nih_age = {
    label: np.mean((nih_df[age_field] > low) & (nih_df[age_field] <= high))
    for label, low, high in zip(age_labels, age_intervals[:-1], age_intervals[1:])
}
# Open-ended top bin; inserted last so the key order stays 0-20 ... 80+.
nih_age['80+'] = np.mean(nih_df[age_field] > age_intervals[-1])

sex_field = 'Patient Gender'
nih_sex = {code: np.mean(nih_df[sex_field] == code) for code in ['M', 'F']}

In [7]:
# Joint sex-by-age weights for each dataset, built as the product of the
# marginal age fraction and the marginal sex fraction.
# Keys are '<sex><age label>' (e.g. 'M0-20'), ordered sex-major then age.
nih_demo = {sex_k + age_k: nih_age[age_k] * nih_sex[sex_k] for sex_k in nih_sex for age_k in nih_age}

cxp_demo = {sex_k + age_k: cxp_age[age_k] * cxp_sex[sex_k] for sex_k in cxp_sex for age_k in cxp_age}

cxr_demo = {sex_k + age_k: cxr_age[age_k] * cxr_sex[sex_k] for sex_k in cxr_sex for age_k in cxr_age}

In [8]:
# NIH ON CXP
# Average the five NIH-trained runs' Pneumothorax scores per image Path on
# the CXP test set, then join the CXP test CSV on Path.
nih_dfs = [pd.read_csv('model_files/nih_run_{}/cxp_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
nih_df = pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
nih_df.columns = ['Path', 'Pneumothorax_pred']
nih_df = nih_df.groupby(nih_df['Path']).mean()
nih_df = nih_df.merge(cxp_test_csv, on=['Path'])
nih_df['study_id'] = nih_df.apply(lambda row: row['Path'].split('chexpert_full/train/')[1].split('/view')[0], axis=1)

# One row per study: label = max over images, score/age = mean, sex = first.
nih_on_cxp = nih_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'Sex', 'Age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'Age': 'mean', 'Sex': 'first'})

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}
# Short sex code used in the demographic keys -> value stored in the 'Sex' column.
sex_values = {'M': 'Male', 'F': 'Female'}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the NIH sex/age weights and pair
    # each count with the key of the weight that produced it.
    ordered_counts = np.random.multinomial(len(nih_on_cxp), list(nih_demo.values()))
    frequency_dict = dict(zip(nih_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            df_to_sample = nih_on_cxp[(nih_on_cxp['Age'] > age_low)&(nih_on_cxp['Age'] <= age_high)&(nih_on_cxp['Sex'] == sex_values[gender])]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))

ci(all_auc_scores, .95, len(all_auc_scores))

(0.7738405055260894, 0.013234151595980903)

In [9]:
# CXR ON CXP
# Average the five CXR-trained runs' Pneumothorax scores per image Path on
# the CXP test set, then join the CXP test CSV on Path.
cxr_dfs = [pd.read_csv('model_files/cxr_run_{}/cxp_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxr_df = pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
cxr_df.columns = ['Path', 'Pneumothorax_pred']
cxr_df = cxr_df.groupby(cxr_df['Path']).mean()
cxr_df = cxr_df.merge(cxp_test_csv, on=['Path'])
cxr_df['study_id'] = cxr_df.apply(lambda row: row['Path'].split('chexpert_full/train/')[1].split('/view')[0], axis=1)

# One row per study: label = max over images, score/age = mean, sex = first.
cxr_on_cxp = cxr_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'Sex', 'Age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'Age': 'mean', 'Sex': 'first'})

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}
# Short sex code used in the demographic keys -> value stored in the 'Sex' column.
sex_values = {'M': 'Male', 'F': 'Female'}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXR sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxr_on_cxp), list(cxr_demo.values()))
    frequency_dict = dict(zip(cxr_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            df_to_sample = cxr_on_cxp[(cxr_on_cxp['Age'] > age_low)&(cxr_on_cxp['Age'] <= age_high)&(cxr_on_cxp['Sex'] == sex_values[gender])]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.82465302603693, 0.011472885564221769)

In [10]:
# CXP ON CXP

# Average the five CXP-trained runs' Pneumothorax scores per image Path on
# the CXP test set, then join the (re-read) CXP test CSV on Path.
cxp_dfs = [pd.read_csv('model_files/cxp_run_{}/cxp_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxp_df = pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
cxp_df.columns = ['Path', 'Pneumothorax_pred']
cxp_df = cxp_df.groupby(cxp_df['Path']).mean()
cxp_test_csv = pd.read_csv('test_dfs/cxp_test_df.csv')
cxp_df = cxp_df.merge(cxp_test_csv, on=['Path'])
# NOTE(review): study_id is computed but not used below; resampling here is
# per image, whereas the cross-dataset cells aggregate to one row per study
# first -- confirm that per-image evaluation is intended.
cxp_df['study_id'] = cxp_df.apply(lambda row: row['Path'].split('chexpert_full/train/')[1].split('/view')[0], axis=1)

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}
# Short sex code used in the demographic keys -> value stored in the 'Sex' column.
sex_values = {'M': 'Male', 'F': 'Female'}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXP sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxp_df), list(cxp_demo.values()))
    frequency_dict = dict(zip(cxp_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            df_to_sample = cxp_df[(cxp_df['Age'] > age_low)&(cxp_df['Age'] <= age_high)&(cxp_df['Sex'] == sex_values[gender])]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8967860180357291, 0.008869070093852849)

In [11]:
# CXR ON NIH
# Average the five CXR-trained runs' Pneumothorax scores per image Path on
# the NIH test set, then join the NIH test CSV on Path.
cxr_dfs = [pd.read_csv('model_files/cxr_run_{}/nih_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxr_df = pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
cxr_df.columns = ['Path', 'Pneumothorax_pred']
cxr_df = cxr_df.groupby(cxr_df['Path']).mean()
cxr_df = cxr_df.merge(nih_test_csv, on=['Path'])
cxr_df['study_id'] = cxr_df.apply(lambda row: str(row['Patient ID'])+'/'+str(row['Follow-up #']), axis=1)

# One row per study: label = max over images, score/age = mean, gender = first.
cxr_on_nih = cxr_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'Patient Gender', 'Patient Age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'Patient Age': 'mean', 'Patient Gender': 'first'})
gc.collect()

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXR sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxr_on_nih), list(cxr_demo.values()))
    frequency_dict = dict(zip(cxr_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'Patient Gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = cxr_on_nih[(cxr_on_nih['Patient Age'] > age_low)&(cxr_on_nih['Patient Age'] <= age_high)&(cxr_on_nih['Patient Gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8252491107360613, 0.022392192240955566)

In [12]:
#CXP on NIH

# Average the five CXP-trained runs' Pneumothorax scores per image Path on
# the NIH test set, then join the NIH test CSV on Path.
cxp_dfs = [pd.read_csv('model_files/cxp_run_{}/nih_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxp_df = pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
cxp_df.columns = ['Path', 'Pneumothorax_pred']
cxp_df = cxp_df.groupby(cxp_df['Path']).mean()
cxp_df = cxp_df.merge(nih_test_csv, on=['Path'])
cxp_df['study_id'] = cxp_df.apply(lambda row: str(row['Patient ID'])+'/'+str(row['Follow-up #']), axis=1)

# One row per study: label = max over images, score/age = mean, gender = first.
cxp_on_nih = cxp_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'Patient Gender', 'Patient Age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'Patient Age': 'mean', 'Patient Gender': 'first'})
gc.collect()

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXP sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxp_on_nih), list(cxp_demo.values()))
    frequency_dict = dict(zip(cxp_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'Patient Gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = cxp_on_nih[(cxp_on_nih['Patient Age'] > age_low)&(cxp_on_nih['Patient Age'] <= age_high)&(cxp_on_nih['Patient Gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8450560075645488, 0.02289977157065276)

In [13]:
#NIH on NIH

# Average the five NIH-trained runs' Pneumothorax scores per image Path on
# the NIH test set, then join the (re-read) NIH test CSV on Path.
nih_dfs = [pd.read_csv('model_files/nih_run_{}/nih_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
nih_df = pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
nih_df.columns = ['Path', 'Pneumothorax_pred']
nih_df = nih_df.groupby(nih_df['Path']).mean()
nih_test_csv = pd.read_csv('test_dfs/nih_test_df.csv')
nih_df = nih_df.merge(nih_test_csv, on=['Path'])
# NOTE(review): study_id is computed but not used below; resampling here is
# per image, whereas the cross-dataset cells aggregate to one row per study
# first -- confirm that per-image evaluation is intended.
nih_df['study_id'] = nih_df.apply(lambda row: str(row['Patient ID'])+'/'+str(row['Follow-up #']), axis=1)

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the NIH sex/age weights and pair
    # each count with the key of the weight that produced it.
    ordered_counts = np.random.multinomial(len(nih_df), list(nih_demo.values()))
    frequency_dict = dict(zip(nih_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'Patient Gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = nih_df[(nih_df['Patient Age'] > age_low)&(nih_df['Patient Age'] <= age_high)&(nih_df['Patient Gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8825563142220857, 0.015042236502857187)

In [14]:
#CXP ON CXR
# Average the five CXP-trained runs' Pneumothorax scores per image Path on
# the CXR test set, then join the CXR test CSV on Path.
cxp_dfs = [pd.read_csv('model_files/cxp_run_{}/cxr_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxp_df = pd.concat(cxp_dfs)[['Path', 'Pneumothorax']]
cxp_df.columns = ['Path', 'Pneumothorax_pred']
cxp_df = cxp_df.groupby(cxp_df['Path']).mean()
cxp_df = cxp_df.merge(cxr_test_csv, on=['Path'])


# NOTE(review): this cell reads 'cxr_demo_df.csv' from the working directory,
# while the CXR-on-CXR and NIH-on-CXR cells read 'test_dfs/cxr_demo_df.csv'
# -- confirm both files exist and match.
demo_df = pd.read_csv('cxr_demo_df.csv')
cxr_combo_df = cxp_df.merge(demo_df, on='subject_id', how='inner')
# Ethnicity grouping aligned with the other CXR cells, which also fold
# 'AMERICAN INDIAN/ALASKA NATIVE' into 'OTHER'; the column is not used below.
cxr_combo_df['ethnicity'] = cxr_combo_df['ethnicity'].apply(lambda x: 'OTHER' if x in ['AMERICAN INDIAN/ALASKA NATIVE', 'UNKNOWN', 'UNABLE TO OBTAIN'] else x)
cxr_combo_df['insurance'] = cxr_combo_df['insurance'].apply(lambda x: 'Other' if x == 'Medicare' else x)
# Drop rows whose anchor_age is exactly 0.
cxr_df = cxr_combo_df[cxr_combo_df['anchor_age'] != 0]

# One row per study: label = max over images, score/age = mean, gender = first.
cxp_on_cxr = cxr_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'gender', 'anchor_age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'anchor_age': 'mean', 'gender': 'first'})
gc.collect()

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXP sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxp_on_cxr), list(cxp_demo.values()))
    frequency_dict = dict(zip(cxp_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = cxp_on_cxr[(cxp_on_cxr['anchor_age'] > age_low)&(cxp_on_cxr['anchor_age'] <= age_high)&(cxp_on_cxr['gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8711835511930189, 0.013202912375821496)

In [15]:
#CXR on CXR

# Average the five CXR-trained runs' Pneumothorax scores per image Path on
# the CXR test set, then join the (re-read) CXR test CSV on Path.
cxr_dfs = [pd.read_csv('model_files/cxr_run_{}/cxr_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
cxr_df = pd.concat(cxr_dfs)[['Path', 'Pneumothorax']]
cxr_df.columns = ['Path', 'Pneumothorax_pred']
cxr_df = cxr_df.groupby(cxr_df['Path']).mean()
cxr_test_csv = pd.read_csv('test_dfs/cxr_test_df.csv')
cxr_df = cxr_df.merge(cxr_test_csv, on=['Path'])

demo_df = pd.read_csv('test_dfs/cxr_demo_df.csv')
cxr_combo_df = cxr_df.merge(demo_df, on='subject_id', how='inner')
cxr_combo_df['ethnicity'] = cxr_combo_df['ethnicity'].apply(lambda x: 'OTHER' if x in ['AMERICAN INDIAN/ALASKA NATIVE', 'UNKNOWN', 'UNABLE TO OBTAIN'] else x)
cxr_combo_df['insurance'] = cxr_combo_df['insurance'].apply(lambda x: 'Other' if x == 'Medicare' else x)
# Drop rows whose anchor_age is exactly 0.
# NOTE(review): resampling below is per image row, whereas the cross-dataset
# CXR cells aggregate to one row per study_id first -- confirm that per-image
# evaluation is intended.
cxr_df = cxr_combo_df[cxr_combo_df['anchor_age'] != 0]

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the CXR sex/age weights and pair
    # each count with the key of the weight that produced it (previously the
    # keys were rebuilt from the NIH dicts and merely happened to match).
    ordered_counts = np.random.multinomial(len(cxr_df), list(cxr_demo.values()))
    frequency_dict = dict(zip(cxr_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = cxr_df[(cxr_df['anchor_age'] > age_low)&(cxr_df['anchor_age'] <= age_high)&(cxr_df['gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.8903461350658666, 0.0032649729195563526)

In [16]:
# NIH ON CXR
# Average the five NIH-trained runs' Pneumothorax scores per image Path on
# the CXR test set, then join the CXR test CSV on Path.
nih_dfs = [pd.read_csv('model_files/nih_run_{}/cxr_test_result.csv'.format(run)) for run in [1, 2, 3, 4, 5]]
nih_df = pd.concat(nih_dfs)[['Path', 'Pneumothorax']]
nih_df.columns = ['Path', 'Pneumothorax_pred']
nih_df = nih_df.groupby(nih_df['Path']).mean()
nih_df = nih_df.merge(cxr_test_csv, on=['Path'])

demo_df = pd.read_csv('test_dfs/cxr_demo_df.csv')
cxr_combo_df = nih_df.merge(demo_df, on='subject_id', how='inner')
cxr_combo_df['ethnicity'] = cxr_combo_df['ethnicity'].apply(lambda x: 'OTHER' if x in ['AMERICAN INDIAN/ALASKA NATIVE', 'UNKNOWN', 'UNABLE TO OBTAIN'] else x)
cxr_combo_df['insurance'] = cxr_combo_df['insurance'].apply(lambda x: 'Other' if x == 'Medicare' else x)
# Drop rows whose anchor_age is exactly 0.
cxr_df = cxr_combo_df[cxr_combo_df['anchor_age'] != 0]

# One row per study: label = max over images, score/age = mean, gender = first.
nih_on_cxr = cxr_df[['Path', 'study_id', 'Pneumothorax', 'Pneumothorax_pred', 'gender', 'anchor_age']].groupby('study_id').agg({'Pneumothorax': 'max', 'Pneumothorax_pred': 'mean', 'anchor_age': 'mean', 'gender': 'first'})
gc.collect()

# Age label -> (exclusive lower, inclusive upper) bound of the stratum.
age_bounds = {'0-20': (0, 20), '21-40': (20, 40), '41-60': (40, 60), '61-80': (60, 80), '80+': (80, 10000)}

all_auc_scores = []

for i in range(500):
    # Draw per-stratum sample counts under the NIH sex/age weights and pair
    # each count with the key of the weight that produced it.
    ordered_counts = np.random.multinomial(len(nih_on_cxr), list(nih_demo.values()))
    frequency_dict = dict(zip(nih_demo, ordered_counts))

    # Collect the strata and concatenate once (DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop).
    subsets = []
    for gender in ['M', 'F']:
        for age in ['0-20', '21-40', '41-60', '61-80', '80+']:
            age_low, age_high = age_bounds[age]
            # 'gender' already stores the 'M'/'F' codes used in the keys.
            df_to_sample = nih_on_cxr[(nih_on_cxr['anchor_age'] > age_low)&(nih_on_cxr['anchor_age'] <= age_high)&(nih_on_cxr['gender'] == gender)]
            # random_state=i: every stratum in iteration i shares the same seed.
            subsets.append(df_to_sample.sample(frequency_dict[gender+age], replace=True, random_state=i))
    sampled_df_all = pd.concat(subsets)

    all_auc_scores.append(roc_auc_score(sampled_df_all['Pneumothorax'].values, sampled_df_all['Pneumothorax_pred'].values))
ci(all_auc_scores, .95, len(all_auc_scores))

(0.7820362328649448, 0.01430275473276632)