## This is an entirely self-contained notebook comparing reproducing our evaluatiob from our [CTO](https://chufangao.github.io/CTOD/) label predictions tested on the manually annotated split

[1] Gao, C., Pradeepkumar, J., Das, T., Thati, S., & Sun, J. (2024). Automatically Labeling Clinical Trial Outcomes: A Large-Scale Benchmark for Drug Development. arXiv preprint arXiv:2406.10292.

In [1]:
# # ================ First, let us get started by cloning everyting in ================
!git clone https://github.com/chufangao/CTOD.git
!git clone https://github.com/futianfan/clinical-trial-outcome-prediction.git
!wget https://huggingface.co/datasets/chufangao/CTO/resolve/main/CTTI.zip
CTTI_PATH = './CTTI.zip'

Cloning into 'CTOD'...
remote: Enumerating objects: 512, done.[K
remote: Counting objects: 100% (281/281), done.[K
remote: Compressing objects: 100% (202/202), done.[K
remote: Total 512 (delta 176), reused 147 (delta 76), pack-reused 231 (from 1)[K
Receiving objects: 100% (512/512), 34.12 MiB | 4.93 MiB/s, done.
Resolving deltas: 100% (276/276), done.
Cloning into 'clinical-trial-outcome-prediction'...
remote: Enumerating objects: 932, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 932 (delta 19), reused 40 (delta 5), pack-reused 863 (from 1)[K
Receiving objects: 100% (932/932), 104.38 MiB | 7.75 MiB/s, done.
Resolving deltas: 100% (534/534), done.
Updating files: 100% (119/119), done.
--2025-02-24 01:01:41--  https://huggingface.co/datasets/chufangao/CTO/resolve/main/CTTI.zip
Resolving huggingface.co (huggingface.co)... 18.244.202.68, 18.244.202.60, 18.244.202.73, ...
Connecting to huggingface.co (huggin

In [None]:
# # if you want to use the latest version of clinical trials instead, uncomment and run this cell
# !pip install selenium
# !python ./CTOD/download_ctti.py
# CTTI_PATH = './downloads/CTTI_new.zip'

In [7]:
# ================ building text features from CTTI ================
import glob
import os
import pandas as pd
import numpy as np
import zipfile

def load_all_studies_with_features(CTTI_PATH):
    with zipfile.ZipFile(CTTI_PATH, 'r') as zip_ref:
        names = zip_ref.namelist()
        all_studies = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='studies.txt'][0]), sep='|')
        diseases = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='browse_conditions.txt'][0]), sep='|')
        interventions = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='interventions.txt'][0]), sep='|')
        criteria = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='eligibilities.txt'][0]), sep='|')
        designs = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='designs.txt'][0]), sep='|')
    # diseases = pd.read_csv(os.path.join(CTTI_PATH, 'browse_conditions.txt'), sep='|')
    diseases = diseases.groupby('nct_id')['downcase_mesh_term'].apply(lambda x: ' '.join(list(x))).reset_index().rename(columns={'downcase_mesh_term': 'diseases'})
    diseases.fillna('', inplace=True)

    # interventions = pd.read_csv(os.path.join(CTTI_PATH, 'interventions.txt'), sep='|')
    interventions = interventions.dropna(subset=['name'])
    interventions['name'] = interventions['name'].str.lower()
    interventions = interventions.groupby('nct_id')['name'].apply(lambda x: ' '.join(list(x))).reset_index().rename(columns={'name': 'interventions'})
    interventions.fillna('', inplace=True)

    # criteria = pd.read_csv(os.path.join(CTTI_PATH, 'eligibilities.txt'), sep='|')[['nct_id', 'criteria']]
    criteria = criteria.dropna(subset=['criteria'])
    criteria.drop_duplicates(subset=['nct_id'], inplace=True)
    criteria['criteria'] = criteria['criteria'].str.lower()
    criteria.fillna('', inplace=True)

    # designs = pd.read_csv(os.path.join(CTTI_PATH, 'designs.txt'), sep='|')
    designs = designs.fillna('')
    designs['design'] = designs['allocation'] + ' ' + designs['intervention_model'] + ' ' + designs['observational_model'] + ' ' + designs['primary_purpose'] + ' ' + designs['time_perspective'] + ' ' + designs['masking']
    designs['design'] = designs['design'].str.lower()
    designs = designs[['nct_id', 'design']]
    designs.drop_duplicates(subset=['nct_id'], inplace=True)
    designs.fillna('', inplace=True)

    # all_studies = pd.read_csv(os.path.join(CTTI_PATH, 'studies.txt'), sep='|')
    all_studies.dropna(subset=['completion_date'], inplace=True)
    all_studies['year'] = all_studies['completion_date'].apply(lambda x: int(x.split('-')[0]))

    all_studies = all_studies.merge(diseases, on='nct_id', how='left')
    all_studies = all_studies.merge(interventions, on='nct_id', how='left')
    all_studies = all_studies.merge(criteria, on='nct_id', how='left')
    all_studies = all_studies.merge(designs, on='nct_id', how='left')
    all_studies['features'] = all_studies['phase'] + ' '  + all_studies['diseases'] + ' '  + all_studies['interventions'] + ' ' + all_studies['design'] + ' ' + all_studies['criteria']
    all_studies = all_studies[all_studies['features'].str.len() > 0]
    return all_studies

In [13]:
# ================ processing CTO predictions ================
import pandas as pd
import sys
import zipfile

def lf_status(path):
    with zipfile.ZipFile(CTTI_PATH, 'r') as zip_ref:
        names = zip_ref.namelist()
        df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='studies.txt'][0]), sep='|')
    df['lf'] = -1
    # lower case all status and replace '_' with ' '
    df['overall_status'] = df['overall_status'].str.lower().str.replace('_', ' ')
    df.loc[df['overall_status'].isin(['terminated', 'withdrawn', 'suspended', 'withheld', 'no longer available', 'temporarily not available']),['lf']] = 0
    df.loc[df['overall_status'].isin(['approved for marketing']),['lf']] = 1
    df['lf'] = df['lf'].fillna(-1).astype('int')
    return df

def lf_pvalues(path): # any p-value sig is good
    with zipfile.ZipFile(CTTI_PATH, 'r') as zip_ref:
        names = zip_ref.namelist()
        df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='outcome_analyses.txt'][0]), sep='|')
        outcomes_df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='outcomes.txt'][0]), sep='|')
    primary_outcomes = outcomes_df[outcomes_df['outcome_type'].str.lower()=='primary']
    df = df[df['outcome_id'].isin(primary_outcomes['id'])]
    df.dropna(subset=['p_value'], inplace=True)

    df['lf'] = df['p_value'] < .05
    df = df.groupby('nct_id')[['lf', 'p_value']].mean().reset_index() # multiple pvalues per nct_id
    df['lf'] = df['p_value'] > 0 # any p-value sig is good
    df['lf'] = df['lf'].fillna(-1).astype('int')
    return df


CTO_GOLD_PATH="https://huggingface.co/datasets/chufangao/CTO/raw/main/human_labels_2020_2024/human_labels_2020_2024.csv"
# CTTI_PATH="./CTTI_new/"
CTO_phase1_preds = pd.read_csv("https://huggingface.co/datasets/chufangao/CTO/raw/main/phase1_CTO_rf.csv")
CTO_phase2_preds = pd.read_csv("https://huggingface.co/datasets/chufangao/CTO/raw/main/phase2_CTO_rf.csv")
CTO_phase3_preds = pd.read_csv("https://huggingface.co/datasets/chufangao/CTO/raw/main/phase3_CTO_rf.csv")
labelmodel_label = pd.concat([CTO_phase1_preds, CTO_phase2_preds, CTO_phase3_preds])
labelmodel_label = labelmodel_label[['nct_id', 'pred', 'pred_proba']].drop_duplicates(subset=['nct_id'])
# print(labelmodel_label['pred'].value_counts())

cto_gold = pd.read_csv(CTO_GOLD_PATH)
# print(cto_gold)
cto_gold = cto_gold[cto_gold['labels'] != -1]
print(cto_gold['labels'].value_counts())

labelmodel_label = labelmodel_label[~labelmodel_label['nct_id'].isin(cto_gold['nct_id'])]
# print(labelmodel_label['pred'].value_counts())

# ======== get features =========
all_studies = load_all_studies_with_features(CTTI_PATH)
# ========== get train and test split ==========
train_studies = all_studies[all_studies['nct_id'].isin(labelmodel_label['nct_id'])]
train_studies['label'] = -1

status_lf = lf_status(path=CTTI_PATH)
status_lf = status_lf[status_lf['lf']!=-1]
status_mapping = status_lf[['nct_id', 'lf']].set_index('nct_id').to_dict()['lf']
train_studies['label'] = train_studies.apply(lambda x: status_mapping[x['nct_id']] if x['nct_id'] in status_mapping else x['label'], axis=1)
print(train_studies['label'].value_counts())

# apply pvalue mapping where pvalue < 0.05 (conservative, some studies may not have pvalues but are still positive)
pvalue_lf = lf_pvalues(path=CTTI_PATH)
pvalue_lf = pvalue_lf[pvalue_lf['lf']==1]
pvalue_mapping = pvalue_lf[['nct_id', 'lf']].set_index('nct_id').to_dict()['lf']
# map lf == -1 to pvalue lf
train_studies.loc[train_studies['label']==-1, 'label'] = train_studies.loc[train_studies['label']==-1].apply(lambda x: pvalue_mapping[x['nct_id']] if x['nct_id'] in pvalue_mapping else x['label'], axis=1)
print(train_studies['label'].value_counts())

# apply labelmodel mapping where studies['label'] == -1
labelmodel_mapping = labelmodel_label.set_index('nct_id').to_dict()['pred_proba']
train_studies.loc[train_studies['label']==-1, 'label'] = train_studies.loc[train_studies['label']==-1].apply(lambda x: labelmodel_mapping[x['nct_id']] if x['nct_id'] in labelmodel_mapping else x['label'], axis=1)
print(train_studies['label'].value_counts())
# print(train_studies['label'].value_counts())

test_studies = all_studies[all_studies['nct_id'].isin(cto_gold['nct_id'])]
gold_mapping = cto_gold.set_index('nct_id').to_dict()['labels']
test_studies['label'] = test_studies['nct_id'].apply(lambda x: gold_mapping[x] if x in gold_mapping else -1)
# print(test_studies['label'].value_counts())

# save for later
train_studies.to_csv('../train_studies.csv', index=False)
test_studies.to_csv('../test_studies.csv', index=False)



            nct_id  nlm_download_date_description study_first_submitted_date  \
0      NCT01236547                            NaN                 2010-11-05   
1      NCT03277586                            NaN                 2017-09-01   
2      NCT03757715                            NaN                 2018-11-27   
3      NCT05272813                            NaN                 2022-02-18   
4      NCT05260541                            NaN                 2022-02-18   
...            ...                            ...                        ...   
11007  NCT03630770                            NaN                 2018-07-25   
11008  NCT05621525                            NaN                 2022-11-10   
11009  NCT03642132                            NaN                 2018-07-13   
11010  NCT03631641                            NaN                 2018-08-13   
11011  NCT02543749                            NaN                 2015-08-25   

      results_first_submitted_date disp

  all_studies = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='studies.txt'][0]), sep='|')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_studies['label'] = -1
  df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='studies.txt'][0]), sep='|')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_studies['label'] = train_studies.apply(lambda x: status_mapping[x['nct_id']] if x['nct_id'] in status_mapping else x['label'], axis=1)


label
-1    65031
 0    11856
Name: count, dtype: int64


  df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='outcome_analyses.txt'][0]), sep='|')


label
-1    58407
 0    11856
 1     6624
Name: count, dtype: int64


  train_studies.loc[train_studies['label']==-1, 'label'] = train_studies.loc[train_studies['label']==-1].apply(lambda x: labelmodel_mapping[x['nct_id']] if x['nct_id'] in labelmodel_mapping else x['label'], axis=1)


label
0.000000    11856
1.000000     6624
0.747575     2179
0.752107     1522
0.743648     1457
            ...  
0.613112        1
0.657456        1
0.719844        1
0.656409        1
0.708336        1
Name: count, Length: 6346, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_studies['label'] = test_studies['nct_id'].apply(lambda x: gold_mapping[x] if x in gold_mapping else -1)


In [17]:
# ================ processing TOP training data ================
import pandas as pd
import numpy as np
import os
import glob
HINT_PATH = './clinical-trial-outcome-prediction/data'

hint_train = glob.glob(os.path.join(HINT_PATH, "phase*train.csv")) + glob.glob(os.path.join(HINT_PATH, "phase*valid.csv")) + glob.glob(os.path.join(HINT_PATH, "phase*test.csv"))
hint_train = pd.concat((pd.read_csv(f) for f in hint_train))
hint_train.drop_duplicates(subset=['nctid'], inplace=True)
hint_train_mapping = hint_train.set_index('nctid').to_dict()['label']
# all_studies = load_all_studies_with_features(CTTI_PATH)
hint_train_studies = all_studies[all_studies['nct_id'].isin(hint_train['nctid'])]
hint_train_studies['label'] = hint_train_studies['nct_id'].apply(lambda x: hint_train_mapping[x] if x in hint_train_mapping else -1)

# hint_train_studies = hint_train_studies[hint_train_studies['year'] <= 2020]
print(hint_train_studies['label'].value_counts())
hint_train_studies.to_csv('hint_train_studies.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hint_train_studies['label'] = hint_train_studies['nct_id'].apply(lambda x: hint_train_mapping[x] if x in hint_train_mapping else -1)


label
1    6129
0    4471
Name: count, dtype: int64


In [26]:
# ================ running baselines ================
import numpy as np
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
import sklearn
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE, ADASYN
import warnings
import copy
warnings.filterwarnings("ignore")

def bootstrap_eval(y_true, y_pred, y_prob, num_samples=100):
    f1s = []
    aps = []
    rocs = []
    for _ in range(num_samples):
        indices = np.random.choice(len(y_true), len(y_true), replace=True)
        # convert to multiclass for precision recall curve
        y_true_multi = np.zeros((len(y_true), 2))
        y_true_multi[np.arange(len(y_true)), y_true] = 1
        y_pred_multi = np.zeros((len(y_pred), 2))
        y_pred_multi[np.arange(len(y_pred)), y_pred] = 1
        y_prob_multi = np.zeros((len(y_prob), 2))
        y_prob_multi[np.arange(len(y_prob)), y_pred] = y_prob

        # accs.append(np.mean(y_true[indices] == y_pred[indices]))
        f1s.append(f1_score(y_true_multi[indices], y_pred_multi[indices], average='weighted'))
        aps.append(average_precision_score(y_true_multi[indices], y_prob_multi[indices], average='weighted'))
        rocs.append(roc_auc_score(y_true_multi[indices], y_pred_multi[indices], average='weighted'))
    return np.mean(f1s), np.std(f1s), np.mean(aps), np.std(aps), np.mean(rocs), np.std(rocs)

def get_conditions_mesh(CTTI_PATH, human_labels_path, top_k=10):
    human_labels = pd.read_csv(human_labels_path)
    with zipfile.ZipFile(CTTI_PATH, 'r') as zip_ref:
        names = zip_ref.namelist()
        conditions_df = pd.read_csv(zip_ref.open([name for name in names if name.split("/")[-1]=='browse_conditions.txt'][0]), sep='|')

    # conditions_df = pd.read_csv(os.path.join(CTTI_PATH, 'browse_conditions.txt'), sep='|', low_memory=False)
    conditions_df = conditions_df[conditions_df['mesh_type'] == 'mesh-ancestor']
    conditions_df_copy = copy.deepcopy(conditions_df.copy())
    conditions_df = conditions_df[conditions_df['nct_id'].isin(set(human_labels['nct_id']))]

    top_term = [conditions_df['downcase_mesh_term'].value_counts().index[0]]
    for k in range(top_k):
        nct_ids = set(conditions_df[conditions_df['downcase_mesh_term'].str.contains(top_term[-1])]['nct_id'].unique())
        conditions_df = conditions_df[~conditions_df['nct_id'].isin(nct_ids)]
        # print('remaining', conditions_df['downcase_mesh_term'].value_counts())
        top_term.append(conditions_df['downcase_mesh_term'].value_counts().index[0])

    mesh_dict =  {top_term[i]: set(conditions_df_copy[conditions_df_copy['downcase_mesh_term'].str.contains(top_term[i])]['nct_id'].unique())
                  for i in range(len(top_term))}

    return mesh_dict

output = []
# train_mode_years = [('TOP', 2021), ('manual', 2021), ('CTO', 2021),
#                     ('TOP', 2022), ('manual', 2022), ('CTO', 2022),
#                     ('TOP', 2023), ('manual', 2023), ('CTO', 2023),
#                     ('TOP', 2024), ('manual', 2024), ('CTO', 2024)]
# train_mode_years = [('TOP', 'manual_2022'),]
train_mode_years = [('CTO', 'manual_2022'),('manual', 'manual_2022'), ('TOP', 'manual_2022')]
mesh_dict = get_conditions_mesh(CTTI_PATH, human_labels_path="https://huggingface.co/datasets/chufangao/CTO/raw/main/human_labels_2020_2024/human_labels_2020_2024.csv")
disease = ''

for train_mode, year in train_mode_years:
    if train_mode == 'TOP':
        test_studies = pd.read_csv('../test_studies.csv')
    # train_studies, test_studies = test_studies[test_studies['year']<2022], test_studies[test_studies['year']>=2022]
        # train_studies = test_studies[(test_studies['year']<year) & (test_studies['year']>=year-1)]
        # train_studies = test_studies[(test_studies['year']<year)]
        if type(year) == str and year=='manual_2022':
            test_studies = test_studies[(test_studies['year']>=2022)]
        elif type(year) is int:
            test_studies = test_studies[(test_studies['year']>=year) & (test_studies['year']<year+1)]
        train_studies = hint_train_studies.copy()
    elif train_mode == 'manual':
        test_studies = pd.read_csv('../test_studies.csv')
        if type(year) == str and year=='manual_2022':
            train_studies = test_studies[(test_studies['year']<2022)]
            test_studies = test_studies[(test_studies['year']>=2022)]
        elif type(year) is int:
            train_studies = test_studies[(test_studies['year']<year)]
            test_studies = test_studies[(test_studies['year']>=year) & (test_studies['year']<year+1)]
    elif train_mode == 'CTO':
        train_studies = pd.read_csv('../train_studies.csv')
        test_studies = pd.read_csv('../test_studies.csv')
        if type(year) == str and year=='manual_2022':
            train_studies = train_studies[(train_studies['year']<2022)]
            test_studies = test_studies[(test_studies['year']>=2022)]
        elif type(year) is int:
            train_studies = train_studies[(train_studies['year']<year) & (train_studies['year']>=2015)]
            test_studies = test_studies[(test_studies['year']>=year) & (test_studies['year']<year+1)]
            # train_studies = train_studies[(train_studies['year']<year) & (train_studies['year']>=year-1)]

    # ========== uncomment and unindent to run on all diseases ==========
    # org_train_studies = train_studies.copy()
    # org_test_studies = test_studies.copy()
    # for disease, disease_ncts in mesh_dict.items():
    #     train_studies = org_train_studies[org_train_studies['nct_id'].isin(disease_ncts)]
    #     test_studies = org_test_studies[org_test_studies['nct_id'].isin(disease_ncts)]
    #     print(disease, train_studies.shape, test_studies.shape, org_train_studies.shape, org_test_studies.shape, len(disease_ncts))

    # ========== get tfidf features ==========
    tfidf = TfidfVectorizer(max_features=2048, stop_words='english')
    X_train = tfidf.fit_transform(train_studies['features'])
    y_train = (train_studies['label'].values > 0.75).astype(int)
    # y_train = train_studies['label'].values

    X_test = tfidf.transform(test_studies['features'])
    y_test = (test_studies['label'].values > .75).astype(int)

    print('y_train', np.unique(y_train, return_counts=True))
    print('y_test', np.unique(y_test, return_counts=True))

    # ========== train models ==========
    # for model_name in ['RF', 'LR','XGBoost', 'MLP',  'SVM', ]:
    for model_name in ['RF', ]:
        # for phase in ['1', '2', '3']:
        for phase in ['all']:
            if model_name == 'RF':
                model = RandomForestClassifier(n_estimators=300, random_state=0, max_depth=8, n_jobs=4)
            elif model_name == 'LR':
                model = LogisticRegression(max_iter=2000, random_state=0, penalty='l2')
            elif model_name == 'SVM':
                # model = LinearSVC(dual="auto", max_iter=10000, random_state=0)
                model = CalibratedClassifierCV(model)
                # model = SVC(kernel='linear', probability=True, random_state=0)
            elif model_name == 'XGBoost':
                model = XGBClassifier(n_estimators=1000, random_state=0, max_depth=8, n_jobs=4)
            elif model_name == 'MLP':
                model = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=2000, random_state=0)
            else:
                raise ValueError('Unknown model name')

            if phase == 'all':
                train_phase_mask = np.ones(len(train_studies)).astype(bool)
                test_phase_mask = np.ones(len(test_studies)).astype(bool)
            else:
                train_phase_mask = train_studies['phase'].str.lower().str.contains(phase)
                train_phase_mask=train_phase_mask.values
                test_phase_mask = test_studies['phase'].str.lower().str.contains(phase)
                test_phase_mask=test_phase_mask.values

            # print('phase train', np.unique(y_train[train_phase_mask], return_counts=True))
            # print('phase test', np.unique(y_test[test_phase_mask], return_counts=True))

            X_train_, y_train_ = SMOTE().fit_resample(X_train[train_phase_mask], y_train[train_phase_mask])
            # print('phase train after smote', np.unique(y_train_, return_counts=True))
            model.fit(X_train_, y_train_)
            # model.fit(X_train[train_phase_mask], y_train[train_phase_mask])
            pred = model.predict(X_test)
            # print('phase pred', np.unique(pred[test_phase_mask], return_counts=True))
            prob = model.predict_proba(X_test)[:, 1]

            # if model_name == 'XGBoost':
            test_studies.loc[test_phase_mask, ['pred']] = pred[test_phase_mask]
            test_studies.loc[test_phase_mask, ['prob']] = prob[test_phase_mask]

            # print(phase, test_df_subset.shape)
            # print(classification_report(test_df_subset['label'], test_df_subset['pred']))
            f1_mean, f1_std, ap_mean, ap_std, roc_mean, roc_std = bootstrap_eval(y_test[test_phase_mask], pred[test_phase_mask], prob[test_phase_mask])
            print(f"{disease}, {train_mode}, {year}, {phase}, {model_name}, {f1_mean:.3f}, {f1_std:.3f}, {ap_mean:.3f}, {ap_std:.3f}, {roc_mean:.3f}, {roc_std:.3f}")
            output.append([disease, train_mode, year, phase, model_name, f1_mean, f1_std, ap_mean, ap_std, roc_mean, roc_std])
            # print(f"{train_mode}, {year}, {phase}, {model_name}, {f1_mean:.3f}, {f1_std:.3f}, {ap_mean:.3f}, {ap_std:.3f}, {roc_mean:.3f}, {roc_std:.3f}")
            # output.append([train_mode, year, phase, model_name, f1_mean, f1_std, ap_mean, ap_std, roc_mean, roc_std])
            # break
# output = pd.DataFrame(output, columns=['year', 'phase', 'model', 'f1_mean', 'f1_std', 'ap_mean', 'ap_std', 'roc_mean', 'roc_std'])
# output = pd.DataFrame(output, columns=['train_mode', 'year', 'phase', 'model', 'f1_mean', 'f1_std', 'ap_mean', 'ap_std', 'roc_mean', 'roc_std'])
output = pd.DataFrame(output, columns=['disease', 'train_mode', 'year', 'phase', 'model', 'f1_mean', 'f1_std', 'ap_mean', 'ap_std', 'roc_mean', 'roc_std'])

y_train (array([0, 1]), array([47775, 20805]))
y_test (array([0, 1]), array([4005, 1612]))
, CTO, manual_2022, all, RF, 0.609, 0.006, 0.622, 0.007, 0.570, 0.008
y_train (array([0, 1]), array([2851, 1672]))
y_test (array([0, 1]), array([4005, 1612]))
, manual, manual_2022, all, RF, 0.684, 0.007, 0.646, 0.008, 0.638, 0.007
y_train (array([0, 1]), array([4471, 6129]))
y_test (array([0, 1]), array([4005, 1612]))
, TOP, manual_2022, all, RF, 0.519, 0.007, 0.637, 0.007, 0.560, 0.008


In [28]:
output

Unnamed: 0,disease,train_mode,year,phase,model,f1_mean,f1_std,ap_mean,ap_std,roc_mean,roc_std
0,,CTO,manual_2022,all,RF,0.608683,0.006123,0.621591,0.007278,0.570048,0.007551
1,,manual,manual_2022,all,RF,0.683974,0.00655,0.645574,0.007569,0.637799,0.007225
2,,TOP,manual_2022,all,RF,0.519071,0.007407,0.636569,0.006862,0.560489,0.007648
