In [None]:
import os
import warnings
from datetime import timedelta

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.sparse import issparse
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from tqdm import tqdm

In [None]:
# load the proteomics tables (measured and RABIT), the EHR representations,
# the label table and the per-patient prediction times
real_prot = pd.read_csv('./measured_proteomics_random.csv')
syn_prot = pd.read_csv('./RABIT_proteomics_random.csv')
# 'labeling_time' is discarded straight after loading
ehr_rep = pd.read_csv('ehr_representations_random.csv').drop(columns='labeling_time')
master_labels = pd.read_csv('./master_labels_random.csv')  # labels changed to match aging clock
pred_time = pd.read_csv('./pred_time_random.csv')

In [None]:
# give both proteomics tables the same patient key name ('patient_ids')
real_prot = real_prot.rename(columns={'eid': 'patient_ids'})
syn_prot = syn_prot.rename(columns={'sample_eid': 'patient_ids'})

In [None]:
# attach each patient's prediction time to master_labels (inner join, so only
# patients present in both tables remain), then drop the duplicate key column
label_final = (
    pred_time[['patient_id', 'prediction_time']]
    .merge(master_labels, left_on='patient_id', right_on='person_id', how='inner')
    .drop(columns=['person_id'])
)
label_final


In [None]:
# combined feature tables: EHR representation joined with measured proteomics,
# and EHR representation joined with RABIT proteomics (patients present in both)
motor_real_prot = pd.merge(ehr_rep, real_prot, on="patient_ids", how="inner")
motor_syn_prot = pd.merge(ehr_rep, syn_prot, on="patient_ids", how="inner")

In [None]:
def process_dataframe(df, timeint, colnames):
    """Convert per-disease diagnosis dates into onset labels for a fixed horizon.

    Dates are compared at day resolution (time-of-day is discarded). For every
    column in ``colnames`` each row receives:

    * ``'remove'`` - the diagnosis date is before the prediction date,
    * ``1``        - the diagnosis date lies in
                     ``[prediction date, prediction date + timeint days]``,
    * ``0``        - otherwise (no or unparseable diagnosis date, or a date
                     after the horizon).

    Rows whose ``prediction_time`` is missing satisfy neither comparison and
    therefore receive ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'patient_id'``, ``'prediction_time'`` and every column
        named in ``colnames``. It is NOT modified.
    timeint : int
        Length of the prediction horizon in days.
    colnames : iterable of str
        Names of the diagnosis-date columns to convert.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; column ``'patient_ids'`` followed by one label
        column per entry of ``colnames``.
    """
    def _as_naive_day(values, **to_datetime_kwargs):
        # parse, drop any timezone while keeping the wall-clock date, and
        # truncate to midnight so comparisons happen at day resolution
        parsed = pd.to_datetime(values, **to_datetime_kwargs)
        if parsed.dt.tz is not None:
            parsed = parsed.dt.tz_localize(None)
        return parsed.dt.normalize()

    prediction_date = _as_naive_day(df['prediction_time'])
    horizon_date = prediction_date + timedelta(days=timeint)

    result_df = pd.DataFrame({'patient_ids': df['patient_id']})

    for col in colnames:
        # unparseable entries become NaT and end up labelled 0
        diagnosis_date = _as_naive_day(df[col], errors='coerce')

        # comparisons against NaT are False, so missing dates fall through to 0
        before_prediction = (diagnosis_date < prediction_date).to_numpy()
        within_horizon = (
            (diagnosis_date >= prediction_date) & (diagnosis_date <= horizon_date)
        ).to_numpy()

        # object array so that integer labels and the 'remove' marker can coexist
        labels = np.zeros(len(df), dtype=object)
        labels[within_horizon] = 1
        labels[before_prediction] = 'remove'

        # infer_objects yields an integer column when no 'remove' marker is present
        result_df[col] = pd.Series(labels, index=df.index).infer_objects()

    return result_df

# every master_labels column except the patient key is a label column
colnames = [name for name in master_labels.columns if name != 'person_id']
timeint = 1827  # 5 years

# build the label table for the chosen horizon and display it
input_labels = process_dataframe(label_final, timeint, colnames)
input_labels

In [None]:
# per column: how many entries are positive (1) and how many are flagged 'remove'
count_ones = input_labels.apply(lambda labels: labels.eq(1).sum())
count_remove = input_labels.apply(lambda labels: labels.eq('remove').sum())

print(count_ones,'\n')
print(count_remove)

# columns without a single positive entry are dropped; 'patient_ids' is the
# key column and is always kept
columns_to_remove = [
    name for name in count_ones[count_ones == 0].index.tolist() if name != 'patient_ids'
]
input_labels = input_labels.drop(columns=columns_to_remove)

# display the pruned label table
input_labels

In [None]:
def _undersample_negatives(X_train, y_train, positive_ratio, rng):
    """Randomly drop negatives so positives make up about ``positive_ratio`` of the training fold.

    The fold is returned unchanged when it has no positives, when positives
    already reach ``positive_ratio``, or when there are not more negatives than
    the number that would be kept.
    """
    pos_mask = y_train == 1
    neg_mask = y_train == 0
    n_pos = pos_mask.sum()
    n_neg = neg_mask.sum()

    if n_pos == 0 or (n_pos / len(y_train)) >= positive_ratio:
        return X_train, y_train

    # number of negatives that yields n_pos / (n_pos + n_neg) == positive_ratio
    desired_neg = int(((1 / positive_ratio) - 1) * n_pos)
    if n_neg <= desired_neg:
        return X_train, y_train

    # sample negative row labels without replacement and keep every positive
    selected_neg_indices = rng.choice(y_train[neg_mask].index, size=desired_neg, replace=False)
    keep = y_train[pos_mask].index.union(pd.Index(selected_neg_indices))
    return X_train.loc[keep], y_train.loc[keep]


def multi_task_cv_xgboost_dense(input_data, label_data, output_dir, timehorizon, positive_ratio=0.25,
                                n_splits=3, n_jobs=12, random_state=42):
    """Train and evaluate one XGBoost classifier per label column with stratified K-fold CV.

    For every column of ``label_data`` other than ``'patient_ids'`` (a "task"),
    rows labelled ``"remove"`` are excluded, the remaining rows are split into
    ``n_splits`` stratified folds, negatives in each training fold are
    undersampled towards ``positive_ratio``, a model is fitted and saved, and
    the held-out predictions of all folds are pooled to compute the metrics.
    Test folds are never resampled.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature table with a ``'patient_ids'`` column; all other columns are
        used as features.
    label_data : pandas.DataFrame
        Label table with a ``'patient_ids'`` column and one column per task
        holding 0, 1 or ``"remove"``.
    output_dir : str
        Root directory for saved models. Fold models are written to
        ``<output_dir>/<timehorizon>/<task>/<task>_fold_<k>.joblib`` (spaces in
        the task name replaced by underscores).
    timehorizon : str
        Name of the sub-directory created under ``output_dir``.
    positive_ratio : float, default 0.25
        Target share of positives in each training fold after undersampling.
    n_splits : int, default 3
        Number of cross-validation folds.
    n_jobs : int, default 12
        Thread count passed to ``XGBClassifier``.
    random_state : int, default 42
        Seed for both the fold shuffling and the negative undersampling.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per task: ``task``, ``accuracy``, ``roc_auc``, ``auprc``,
        ``prevalence``. Tasks with fewer than two classes after filtering get
        NaN metrics and are not trained.
    predvalues_df : pandas.DataFrame
        One row per (patient, task) held-out prediction.
    featimp_df : pandas.DataFrame
        Tasks x features table of gain importances averaged across folds,
        with 0 where a feature has no importance recorded for a task.
    """
    # create the timehorizon subdirectory within output_dir
    timehorizon_dir = os.path.join(output_dir, timehorizon)
    os.makedirs(timehorizon_dir, exist_ok=True)

    # merge the input and label dataframes on 'patient_ids'
    df_merged = input_data.merge(label_data, on='patient_ids')
    # every label_data column (the key included) is excluded from the features
    non_feature_columns = label_data.columns.tolist()

    results = []
    predvalues = []
    featimp_dict = {}

    # one generator shared by all tasks and folds, so the sampling sequence is reproducible
    rng = np.random.RandomState(random_state)

    for task in tqdm(label_data.columns.drop('patient_ids'), desc="Tasks"):
        # remove rows where the label is "remove" for the current task
        df_task = df_merged[df_merged[task] != "remove"]
        X = df_task.drop(columns=non_feature_columns)
        y = df_task[task].astype(int)
        task_patient_ids = df_task['patient_ids']

        # roc_auc_score raises when only one class is present; record the task
        # with NaN metrics instead of aborting and losing all earlier results
        if y.nunique() < 2:
            warnings.warn(
                f"Skipping task '{task}': fewer than two classes remain after merging and filtering."
            )
            results.append({
                'task': task,
                'accuracy': np.nan,
                'roc_auc': np.nan,
                'auprc': np.nan,
                'prevalence': y.mean() if len(y) else np.nan
            })
            continue

        # create a subdirectory for the task's fold models
        task_dir = os.path.join(timehorizon_dir, task.replace(" ", "_"))
        os.makedirs(task_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        all_y_true = []
        all_y_pred = []
        all_y_pred_proba = []
        featimp_per_task = []

        for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            test_patient_ids = task_patient_ids.iloc[test_index]

            # undersample negatives in the training fold only
            X_train, y_train = _undersample_negatives(X_train, y_train, positive_ratio, rng)

            xgb_model = xgb.XGBClassifier(eval_metric='logloss', n_jobs=n_jobs)
            xgb_model.fit(X_train, y_train)

            # save model for fold
            model_filename = os.path.join(task_dir, f"{task.replace(' ', '_')}_fold_{fold_idx}.joblib")
            joblib.dump(xgb_model, model_filename)

            # evaluate on the held-out fold
            y_pred = xgb_model.predict(X_test)
            y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # positive-class probability

            all_y_true.extend(y_test)
            all_y_pred.extend(y_pred)
            all_y_pred_proba.extend(y_pred_proba)

            for pid, true_label, pred_prob, pred_label in zip(test_patient_ids, y_test, y_pred_proba, y_pred):
                predvalues.append({
                    'patient_ids': pid,
                    'task': task,
                    'true_label': true_label,
                    'predicted_probability': pred_prob,
                    'predicted_label': pred_label
                })

            # gain-based feature importance for the current fold
            featimp_per_task.append(xgb_model.get_booster().get_score(importance_type='gain'))

        # average feature importance across folds; a feature missing from a
        # fold's score dict is NaN in that row and is ignored by mean()
        avg_featimp = pd.DataFrame(featimp_per_task).mean(axis=0).to_dict()
        featimp_dict[task] = avg_featimp

        # metrics are computed once on the pooled held-out predictions
        all_y_true = np.array(all_y_true)
        all_y_pred = np.array(all_y_pred)
        all_y_pred_proba = np.array(all_y_pred_proba)

        results.append({
            'task': task,
            'accuracy': accuracy_score(all_y_true, all_y_pred),
            'roc_auc': roc_auc_score(all_y_true, all_y_pred_proba),
            'auprc': average_precision_score(all_y_true, all_y_pred_proba),
            'prevalence': np.mean(all_y_true)
        })

    results_df = pd.DataFrame(results)
    predvalues_df = pd.DataFrame(predvalues)

    featimp_df = pd.DataFrame(featimp_dict).T.fillna(0)  # Fill missing values with 0

    return results_df, predvalues_df, featimp_df


In [None]:
# share of positive cases targeted when the negatives are subsampled
pos_percent = 0.10
# horizon label, used for the model sub-directory and the output file names
timeperiod = '5yrs'

In [None]:
# directory receiving the result / prediction / feature-importance CSV files
outputdir = '/path/to/outputdir'

# one model directory per feature set
# proteomics only (measured, RABIT)
modeloutputdir_realprot = '/path/to/modeldir/task_models_measuredprot'
modeloutputdir_synprot = '/path/to/modeldir/task_models_rabitprot'
# EHR representation only
modeloutputdir_motorrep = '/path/to/modeldir/task_models_ehrrep'
# EHR representation combined with proteomics (measured, RABIT)
modeloutputdir_motorrealprot = '/path/to/modeldir/task_models_ehr_measuredprot'
modeloutputdir_motorsynprot = '/path/to/modeldir/task_models_ehr_rabitprot'

# Train disease onset prediction models

In [None]:
# measured proteomics only: cross-validate, then write metrics, per-patient
# predictions and feature importances
real_prot_results, real_prot_predvalues, real_prot_featimp = multi_task_cv_xgboost_dense(
    real_prot, input_labels, modeloutputdir_realprot, timeperiod, positive_ratio=pos_percent
)
# os.path.join uses outputdir exactly as configured (relative or absolute)
# without prepending an extra path separator
real_prot_results.to_csv(os.path.join(outputdir, f"real_prot_{timeperiod}_results.csv"), index=False)
real_prot_predvalues.to_csv(os.path.join(outputdir, f"real_prot_{timeperiod}_predvalues.csv"), index=False)
real_prot_featimp.to_csv(os.path.join(outputdir, f"real_prot_{timeperiod}_featimp.csv"))
real_prot_results

In [None]:
# RABIT proteomics only: cross-validate, then write metrics, per-patient
# predictions and feature importances
syn_prot_results, syn_prot_predvalues, syn_featimp = multi_task_cv_xgboost_dense(
    syn_prot, input_labels, modeloutputdir_synprot, timeperiod, positive_ratio=pos_percent
)
# os.path.join uses outputdir exactly as configured (relative or absolute)
# without prepending an extra path separator
syn_prot_results.to_csv(os.path.join(outputdir, f"syn_prot_{timeperiod}_results.csv"), index=False)
syn_prot_predvalues.to_csv(os.path.join(outputdir, f"syn_prot_{timeperiod}_predvalues.csv"), index=False)
syn_featimp.to_csv(os.path.join(outputdir, f"syn_prot_{timeperiod}_featimp.csv"))
syn_prot_results

In [None]:
# EHR representation only: cross-validate, then write metrics, per-patient
# predictions and feature importances
ehr_rep_results, ehr_rep_predvalues, ehr_rep_featimp = multi_task_cv_xgboost_dense(
    ehr_rep, input_labels, modeloutputdir_motorrep, timeperiod, positive_ratio=pos_percent
)
# os.path.join uses outputdir exactly as configured (relative or absolute)
# without prepending an extra path separator
ehr_rep_results.to_csv(os.path.join(outputdir, f"ehr_rep_{timeperiod}_results.csv"), index=False)
ehr_rep_predvalues.to_csv(os.path.join(outputdir, f"ehr_rep_{timeperiod}_predvalues.csv"), index=False)
ehr_rep_featimp.to_csv(os.path.join(outputdir, f"ehr_rep_{timeperiod}_featimp.csv"))
ehr_rep_results

In [None]:
# EHR representation + measured proteomics: cross-validate, then write metrics,
# per-patient predictions and feature importances
motor_realprot_results, motor_realprot_predvalues, motor_realprot_featimp = multi_task_cv_xgboost_dense(
    motor_real_prot, input_labels, modeloutputdir_motorrealprot, timeperiod, positive_ratio=pos_percent
)
# os.path.join uses outputdir exactly as configured (relative or absolute)
# without prepending an extra path separator
motor_realprot_results.to_csv(os.path.join(outputdir, f"motor_realprot_{timeperiod}_results.csv"), index=False)
motor_realprot_predvalues.to_csv(os.path.join(outputdir, f"motor_realprot_{timeperiod}_predvalues.csv"), index=False)
motor_realprot_featimp.to_csv(os.path.join(outputdir, f"motor_realprot_{timeperiod}_featimp.csv"))
motor_realprot_results

In [None]:
# EHR representation + RABIT proteomics: cross-validate, then write metrics,
# per-patient predictions and feature importances
motor_synprot_results, motor_synprot_predvalues, motor_synprot_featimp = multi_task_cv_xgboost_dense(
    motor_syn_prot, input_labels, modeloutputdir_motorsynprot, timeperiod, positive_ratio=pos_percent
)
# os.path.join uses outputdir exactly as configured (relative or absolute)
# without prepending an extra path separator
motor_synprot_results.to_csv(os.path.join(outputdir, f"motor_synprot_{timeperiod}_results.csv"), index=False)
motor_synprot_predvalues.to_csv(os.path.join(outputdir, f"motor_synprot_{timeperiod}_predvalues.csv"), index=False)
motor_synprot_featimp.to_csv(os.path.join(outputdir, f"motor_synprot_{timeperiod}_featimp.csv"))
motor_synprot_results