In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from tqdm import tqdm
from sklearn.metrics import precision_recall_curve
# suppress RunTimeWarning
import warnings
# warnings.filterwarnings("ignore", category=RuntimeWarning)

## Bootstrap Function

In [15]:

def get_optimal_f1_threshold(y_true, y_pred):
    epsilon = 1e-10
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    f1 = 2 * precision * recall / (precision + recall + epsilon)
    return thresholds[np.argmax(f1)]

def bootstrap_stats(y_true, y_prob1, y_prob2, bootstrap_samples=10000):
    y_true = np.array(y_true)
    results = {}
    results['auc1'] = roc_auc_score(y_true, y_prob1)
    results['auc2'] = roc_auc_score(y_true, y_prob2)
    results['auc_diff'] = results['auc2'] - results['auc1']
    aucs1 = []
    aucs2 = []
    auc_diffs = []
    thresholds1 = get_optimal_f1_threshold(y_true, y_prob1)
    thresholds2 = get_optimal_f1_threshold(y_true, y_prob2)
    results['f1_score1'] = f1_score(y_true, y_prob1 > thresholds1)
    results['f1_score2'] = f1_score(y_true, y_prob2 > thresholds2)
    results['f1_diff'] = results['f1_score2'] - results['f1_score1']
    f1_scores1 = []
    f1_scores2 = []
    f1_diffs = []

    # Run the bootstrap
    for _ in tqdm(range(bootstrap_samples)):
        idx = np.random.choice(range(len(y_true)), len(y_true), replace=True)
        auc1 = roc_auc_score(y_true[idx], y_prob1[idx])
        auc2 = roc_auc_score(y_true[idx], y_prob2[idx])
        thresholds1 = get_optimal_f1_threshold(y_true[idx], y_prob1[idx])
        thresholds2 = get_optimal_f1_threshold(y_true[idx], y_prob2[idx])
        f1_1 = f1_score(y_true[idx], y_prob1[idx] > thresholds1)
        f1_2 = f1_score(y_true[idx], y_prob2[idx] > thresholds2)
        aucs1.append(auc1)
        aucs2.append(auc2)
        auc_diffs.append(auc2 - auc1)
        f1_scores1.append(f1_1)
        f1_scores2.append(f1_2)
        f1_diffs.append(f1_2 - f1_1)
    
    # Compute confidence intervals
    aucs1 = np.array(aucs1)
    aucs2 = np.array(aucs2)
    auc_diffs = np.array(auc_diffs)
    results['auc_diff_ci'] = np.percentile(auc_diffs, [2.5, 97.5])
    if results['auc2'] > results['auc1']:
        results['auc_diff_p_value'] = (auc_diffs < 0).mean()
    else:
        results['auc_diff_p_value'] = (auc_diffs > 0).mean()
    results['auc1_ci'] = np.percentile(aucs1, [2.5, 97.5])
    results['auc2_ci'] = np.percentile(aucs2, [2.5, 97.5])
    f1_scores1 = np.array(f1_scores1)
    f1_scores2 = np.array(f1_scores2)
    f1_diffs = np.array(f1_diffs)
    results['f1_diff_ci'] = np.percentile(f1_diffs, [2.5, 97.5])
    if results['f1_score2'] > results['f1_score1']:
        results['f1_diff_p_value'] = (f1_diffs < 0).mean()
    else:
        results['f1_diff_p_value'] = (f1_diffs > 0).mean()
    results['f1_score1_ci'] = np.percentile(f1_scores1, [2.5, 97.5])
    results['f1_score2_ci'] = np.percentile(f1_scores2, [2.5, 97.5])
    return results

## Load BRSet Labels

In [3]:
brset_embed = pd.read_csv('embeddings.csv') # From Embeddings archive
brset_split = pd.read_csv('split.csv') # generated from resplit_data.ipynb

# Load the true values
y_test = np.array(brset_embed[brset_split['split'] == 'test']['DR_2'])
y_test_embed = np.array(brset_embed[brset_split['embeddings_split'] == 'test']['DR_2'])

In [19]:
model_summary_list = []
difference_summary_list = []

def add_model_line(model_name, summary_list, results, idx):
    summary_list.append({
        'model': model_name,
        'auc': results[f'auc{idx}'],
        'auc_ci' : results[f'auc{idx}_ci'],
        # 'auc_ci_lower': results[f'auc{idx}_ci'][0],
        # 'auc_ci_upper': results[f'auc{idx}_ci'][1],
        'f1_score': results[f'f1_score{idx}'],
        'f1_score_ci': results[f'f1_score{idx}_ci'],
        # 'f1_score_ci_lower': results[f'f1_score{idx}_ci'][0],
        # 'f1_score_ci_upper': results[f'f1_score{idx}_ci'][1]
    })

def add_difference_line(model1_name, model2_name, summary_list, results):
    summary_list.append({
        'model1': model1_name,
        'model1_auc': results['auc1'],
        'model1_f1_score': results['f1_score1'],
        'model2': model2_name,
        'model2_auc': results['auc2'],
        'model2_f1_score': results['f1_score2'],
        'auc_diff': results['auc_diff'],
        'auc_diff_ci': results['auc_diff_ci'],
        # 'auc_diff_ci_lower': results['auc_diff_ci'][0],
        # 'auc_diff_ci_upper': results['auc_diff_ci'][1],
        'auc_diff_p_value': results['auc_diff_p_value'],
        'f1_diff': results['f1_diff'],
        'f1_diff_ci': results['f1_diff_ci'],
        # 'f1_diff_ci_lower': results['f1_diff_ci'][0],
        # 'f1_diff_ci_upper': results['f1_diff_ci'][1],
        'f1_diff_p_value': results['f1_diff_p_value']
    })

## Note Analysis - All columns vs Patient History
The all columns represents using all the data that was used to generate the text embeddings data, while the history columns are limited to the columns representing patient history.  The difference between these two represents the effect that adding in a clinical examination of the eye by an ophthalmologist.

In [13]:
bootstrap_samples = 1000 # Number of bootstrap samples to run; 10000 for final results

In [16]:
xgb_all_columns_test_probs = np.load('probs/xgb_all_columns_test_probs.npy')
xgb_pt_history_test_probs = np.load('probs/xgb_pt_history_test_probs.npy')

results = bootstrap_stats(y_test, xgb_all_columns_test_probs, xgb_pt_history_test_probs, bootstrap_samples=bootstrap_samples)
add_model_line('XGBoost Complete Data', model_summary_list, results, 1)
add_model_line('XGBoost Patient History', model_summary_list, results, 2)
add_difference_line('XGBoost Complete Data', 'XGBoost Patient History', difference_summary_list, results)

100%|██████████| 1000/1000 [00:03<00:00, 266.13it/s]


## Summary of Results

In [17]:
model_summary_df = pd.DataFrame(model_summary_list, columns=['model', 'auc',
                                                                'auc_ci', 
                                                            #  'auc_ci_lower', 'auc_ci_upper', 
                                                             'f1_score',
                                                                'f1_score_ci' 
                                                            #  'f1_score_ci_lower', 'f1_score_ci_upper'
                                                             ])
difference_summary_df = pd.DataFrame(difference_summary_list, columns=['model1', 'model1_auc', 'model1_f1_score', 
                                                                       'model2', 'model2_auc', 'model2_f1_score', 
                                                                       'auc_diff', 
                                                                       'auc_diff_ci', 
                                                                    #    'auc_diff_ci_lower', 'auc_diff_ci_upper', 
                                                                       'auc_diff_p_value',  
                                                                       'f1_diff', 
                                                                       'f1_diff_ci'
                                                                       #'f1_diff_ci_lower', 'f1_diff_ci_upper', 
                                                                       'f1_diff_p_value'])

In [18]:
model_summary_df

Unnamed: 0,model,auc,auc_ci_lower,auc_ci_upper,f1_score,f1_score_ci_lower,f1_score_ci_upper
0,XGBoost Complete Data,0.976969,0.96606,0.986529,0.862069,0.824742,0.895522
1,XGBoost Patient History,0.851167,0.825734,0.875598,0.413953,0.35593,0.47185
2,XGBoost Complete Data,0.976969,0.966047,0.986354,0.862069,0.826775,0.895443
3,XGBoost Patient History,0.851167,0.826143,0.874934,0.413953,0.355449,0.4762


In [12]:
difference_summary_df

Unnamed: 0,model1,model1_auc,model1_f1_score,model2,model2_auc,model2_f1_score,auc_diff,auc_diff_ci_lower,auc_diff_ci_upper,auc_diff_p_value,f1_diff,f1_diff_ci_lower,f1_diff_ci_upper,f1_diff_p_value
0,XGBoost Complete Data,0.976969,0.862069,XGBoost Patient History,0.851167,0.413953,-0.125802,-0.149992,-0.102462,0.0,-0.448115,-0.51386,-0.380871,0.0
