In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from tqdm import tqdm

## Bootstrap Function

In [2]:

def _opposite_side_fraction(diffs, score1, score2):
    """Share of bootstrap differences that do not favour the observed winner (ties included)."""
    if score2 > score1:
        return (diffs <= 0).mean()
    return (diffs >= 0).mean()


def bootstrap_stats(y_true, y_prob1, y_prob2, B=10000, seed=None):
    """Paired bootstrap comparison of two binary classifiers on the same samples.

    Point estimates of ROC AUC and F1 (probabilities thresholded at ``> 0.5``)
    are computed on the full data. The samples are then resampled with
    replacement ``B`` times, using the same indices for both models, to obtain
    percentile confidence intervals and a one-sided bootstrap p-value for the
    difference "model 2 minus model 1".

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels. Converted with ``np.asarray`` so lists and
        pandas Series are indexed by position.
    y_prob1, y_prob2 : array-like
        Scores / probabilities of model 1 and model 2, same length as ``y_true``.
    B : int, default 10000
        Number of bootstrap replicates.
    seed : int or None, default None
        ``None`` draws from NumPy's global RNG (the historical behaviour, so
        ``np.random.seed`` still controls it). An integer uses a private
        ``np.random.RandomState(seed)`` and leaves the global RNG untouched.

    Returns
    -------
    dict
        ``auc1``, ``auc2``, ``auc_diff``, ``f1_score1``, ``f1_score2``, ``f1_diff``:
        point estimates (differences are model 2 minus model 1).
        ``*_ci``: arrays holding the 2.5th and 97.5th bootstrap percentiles.
        ``auc_diff_p_value``, ``f1_diff_p_value``: fraction of replicate
        differences that are zero or have the opposite sign of the observed
        ordering. A value of 0.0 only means that none of the ``B`` replicates
        crossed zero.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    y_true = np.asarray(y_true)
    y_prob1 = np.asarray(y_prob1)
    y_prob2 = np.asarray(y_prob2)
    n = len(y_true)
    if len(y_prob1) != n or len(y_prob2) != n:
        raise ValueError(
            "y_true, y_prob1 and y_prob2 must have the same length "
            "(got %d, %d, %d)" % (n, len(y_prob1), len(y_prob2))
        )

    # The np.random module and a RandomState instance expose the same `choice`
    # API, so the unseeded path keeps consuming the global stream as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Threshold once; indexing the boolean vector equals thresholding the resample.
    y_pred1 = y_prob1 > 0.5
    y_pred2 = y_prob2 > 0.5

    # Insertion order of the keys is kept as it was so the displayed dict is unchanged.
    results = {}
    results['auc1'] = roc_auc_score(y_true, y_prob1)
    results['auc2'] = roc_auc_score(y_true, y_prob2)
    results['auc_diff'] = results['auc2'] - results['auc1']
    results['f1_score1'] = f1_score(y_true, y_pred1)
    results['f1_score2'] = f1_score(y_true, y_pred2)
    results['f1_diff'] = results['f1_score2'] - results['f1_score1']

    aucs1 = np.empty(B)
    aucs2 = np.empty(B)
    f1_scores1 = np.empty(B)
    f1_scores2 = np.empty(B)

    # AUC is undefined on a resample that contains a single class. Such draws
    # are replaced, but only when the full data has both classes (otherwise
    # the metric functions are left to react exactly as they did before).
    can_redraw = np.unique(y_true).size > 1

    # Run the bootstrap
    for b in tqdm(range(B)):
        idx = rng.choice(n, n, replace=True)
        while can_redraw and np.unique(y_true[idx]).size < 2:
            idx = rng.choice(n, n, replace=True)
        y_boot = y_true[idx]
        aucs1[b] = roc_auc_score(y_boot, y_prob1[idx])
        aucs2[b] = roc_auc_score(y_boot, y_prob2[idx])
        f1_scores1[b] = f1_score(y_boot, y_pred1[idx])
        f1_scores2[b] = f1_score(y_boot, y_pred2[idx])

    auc_diffs = aucs2 - aucs1
    f1_diffs = f1_scores2 - f1_scores1

    # Compute confidence intervals
    ci_bounds = [2.5, 97.5]
    results['auc_diff_ci'] = np.percentile(auc_diffs, ci_bounds)
    results['auc_diff_p_value'] = _opposite_side_fraction(
        auc_diffs, results['auc1'], results['auc2'])
    results['auc1_ci'] = np.percentile(aucs1, ci_bounds)
    results['auc2_ci'] = np.percentile(aucs2, ci_bounds)
    results['f1_diff_ci'] = np.percentile(f1_diffs, ci_bounds)
    results['f1_diff_p_value'] = _opposite_side_fraction(
        f1_diffs, results['f1_score1'], results['f1_score2'])
    results['f1_score1_ci'] = np.percentile(f1_scores1, ci_bounds)
    results['f1_score2_ci'] = np.percentile(f1_scores2, ci_bounds)
    return results

## Load BRSet Labels

In [3]:
# Embeddings table, taken from the Embeddings archive; the DR_2 label column is read from it below.
brset_embed = pd.read_csv('embeddings.csv')
# Split assignments, generated by resplit_data.ipynb.
brset_split = pd.read_csv('split.csv')

# True DR_2 labels of the test rows, once per split definition.
# NOTE(review): the boolean mask comes from brset_split but filters brset_embed,
# so both files are assumed to list the same rows in the same order — confirm.
y_test = np.array(brset_embed.loc[brset_split['split'] == 'test', 'DR_2'])
y_test_embed = np.array(brset_embed.loc[brset_split['embeddings_split'] == 'test', 'DR_2'])

## Note Analysis - All columns vs Patient History

In [4]:
# Saved test-set probabilities of the two XGBoost models being compared
# (judging by the file names: all columns vs. patient-history columns only).
xgb_all_columns_test_probs = np.load('probs/xgb_all_columns_test_probs.npy')
xgb_pt_history_test_probs = np.load('probs/xgb_pt_history_test_probs.npy')

# bootstrap_stats resamples through NumPy's global RNG; fixing the seed here
# makes the intervals and p-values below identical on every re-run of this cell.
np.random.seed(42)

# Differences are reported as second model minus first, i.e. patient history
# minus all columns, so negative values favour the all-columns model.
results = bootstrap_stats(y_test, xgb_all_columns_test_probs, xgb_pt_history_test_probs, B=10000)
results

100%|██████████| 10000/10000 [00:37<00:00, 267.26it/s]


{'auc1': 0.9769691825582075,
 'auc2': 0.8511669173024821,
 'auc_diff': -0.12580226525572547,
 'f1_score1': 0.8648648648648649,
 'f1_score2': 0.3486842105263158,
 'f1_diff': -0.5161806543385491,
 'auc_diff_ci': array([-0.14976049, -0.1026009 ]),
 'auc_diff_p_value': 0.0,
 'auc1_ci': array([0.96567581, 0.98658303]),
 'auc2_ci': array([0.82568875, 0.87555933]),
 'f1_diff_ci': array([-0.59164667, -0.43990956]),
 'f1_diff_p_value': 0.0,
 'f1_score1_ci': array([0.82653061, 0.89908257]),
 'f1_score2_ci': array([0.27857143, 0.41640425])}