In [12]:
########################################################################################################################
# This script runs XGBoost modeling.
########################################################################################################################

In [3]:
########################################################################################################################
# Import packages
########################################################################################################################
import numpy as np
import os
import pandas as pd
import xgboost as xgb
import warnings
from _Helper_Scripts.ablation import ablate
from _Helper_Scripts.binary_metrics import binary_metrics, flagged_at_top_k_ppv, nb_weight_from_pt, threshold_at_specificity_k
from _Helper_Scripts.result_organizer import optimize_width
from itertools import product
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split
from time import time
from typing import Literal, Optional
from xgboost import XGBClassifier
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
np.random.seed(42)
print('Packages loaded')

Packages loaded


In [5]:
########################################################################################################################
# USER_SPECIFIC SETTING
# C_LIST: A list of different numbers of feature encounters to be included
# D_LIST: A list of different maximum widths of the look-back window in days
# CSL_LIST: A list of booleans indicating whether to perform cost-sensitive learning (CSL)
# IMPUTE_LIST: A list of strings representing the imputation methods adopted
# ABLATION: Boolean. False for pre-ablation modeling and True for post-ablation modeling
#           (the post-ablation run reads the SHAP files written by the pre-ablation run)
# CS_GRID: A list of cost multipliers to penalize false positives
#          (applied as the sample weight of the negative-class training samples)
# EST_GRID: A list of integers representing the numbers of estimators searched during model optimization
# N_FOLDS: An integer representing the number of folds for cross-validation
# NOTE(review): N_LAYERS was documented here but no such constant is defined or used in this notebook.
########################################################################################################################
C_LIST: list[int] = [1]                         # Number of encounters (Default: [1])
D_LIST: list[int] = [60]                        # Width of look-back window (Default: [60])
CSL_LIST: list[bool] = [False, True]            # Whether to perform cost-sensitive learning (Default: [False, True])
IMPUTE_LIST: list[str] = ['Zero', 'Mean', 'Median'] 
                                                # Method of imputation (Default: ['Zero', 'Mean', 'Median'])
ABLATION: bool = False                          # Whether to perform model ablation (Default: False. Use True only after False)
CS_GRID: list[int] = [2, 3, 4]                  # Grid for cost multipliers (Default: [2, 3, 4])
EST_GRID: list[int] = [16, 32, 64, 128, 256, 512, 1024]
                                                # Number of estimators of the XGBoost model 
                                                # (Default: [16, 32, 64, 128, 256, 512, 1024])
N_FOLDS: int = 5                                # Number of folds for cross-validation (Default: 5)

In [1]:
########################################################################################################################
# USER_SPECIFIC SETTING
# IN_DIR_PATH: Path of the input directory storing the organized datasets for modeling
# (created in P06_Point_Data_Preparation.ipynb)
# Expected layout (as read by data_load): <IN_DIR_PATH>/{C}_encounters_{D}_days/{impute}/ containing
# X_train.npy, X_test.npy, y_train.npy, y_test.npy and Feature_Names.csv (with a 'Features' column).
# The default is a relative path, so it is resolved against the current working directory.
########################################################################################################################
IN_DIR_PATH: str = '../00_Data/02_Processed_Data/Point_Model_Data/'

In [6]:
########################################################################################################################
# 1. Define a function to create a XGBoost model
########################################################################################################################
def create_xgb(y_train, n_estimators=1000):
    """Build an unfitted XGBoost binary classifier with this notebook's fixed hyper-parameters.

    Parameters
    ----------
    y_train
        Not used by the function body; kept in the signature so existing call sites keep working.
    n_estimators : int
        Number of boosting rounds (the only hyper-parameter that callers vary).

    Returns
    -------
    XGBClassifier
        A new, unfitted classifier.
    """
    return XGBClassifier(
        # Learning task and runtime
        objective='binary:logistic',
        eval_metric=['logloss'],
        tree_method='hist',
        n_jobs=-1,
        random_state=42,
        missing=np.nan,
        # Ensemble size and step size
        n_estimators=n_estimators,
        learning_rate=0.025,
        max_delta_step=1,
        # Tree shape
        max_depth=5,
        min_child_weight=7,
        gamma=0.5,
        # Row / column sub-sampling
        subsample=0.8,
        colsample_bytree=0.8,
        # Weight regularisation
        reg_alpha=0.0,
        reg_lambda=3.0,
    )

In [9]:
########################################################################################################################
# 2. Define a function to train a XGBoost model (with internal validation)
########################################################################################################################
def train_xgb(M, X_train, y_train, X_val, y_val, cost_mul=1):
    """Fit ``M`` on the training data and return the same (now fitted) object.

    Rows whose label equals 0 receive a sample weight of ``cost_mul``; every other row gets weight 1.
    ``(X_val, y_val)`` is handed to ``fit`` as the single evaluation set, and fitting runs silently.
    """
    # Per-row weights: negatives are up-weighted by the cost multiplier
    is_negative = y_train == 0
    row_weights = np.where(is_negative, cost_mul, 1).astype(np.float32)

    M.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          sample_weight=row_weights,
          verbose=False)
    return M

In [10]:
########################################################################################################################
# 3. Define a function to evaluate a XGBoost model
########################################################################################################################
def eval_xgb(model, X_test, y_test, prefix=''):
    """Score a fitted model on ``(X_test, y_test)`` at several operating points.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba`` and ``get_booster``.
    X_test, y_test
        Features and true labels of the evaluation set.
    prefix : str
        Forwarded to ``binary_metrics`` as its ``prefix`` argument.

    Returns
    -------
    tuple[dict, float]
        The merged metrics dictionary (each ``binary_metrics`` key gets the operating-point suffix appended)
        and the elapsed seconds, rounded to 3 decimals, spent on prediction plus leaf counting.
    """
    started = time()
    y_prob = model.predict_proba(X_test)[:, 1]
    booster = model.get_booster()
    # The total number of leaves over all dumped trees is reported as the model's parameter count
    leaf_total = sum(int(tree_dump.count(':leaf')) for tree_dump in booster.get_dump())
    finished = time()

    # Operating points as (value, column-suffix) pairs: the plain 0.5 cut-off first,
    # then the top-k% precision points, then the fixed-specificity thresholds
    operating_points = [(0.5, '')]
    for k, label in ((1, '@Precision1%'), (2, '@Precision2%'), (5, '@Precision5%')):
        operating_points.append((flagged_at_top_k_ppv(y_prob, k=k), label))
    for spec, label in ((99, '@99Spec'), (95, '@95Spec'), (90, '@90Spec')):
        operating_points.append((threshold_at_specificity_k(y_test, y_prob, spec), label))

    nbw = nb_weight_from_pt(1/11)
    output_dict = {}
    for value, suffix in operating_points:
        is_default = suffix == ''
        # Only the '@Precision' points pass their value as an override of the predictions;
        # only the '@...Spec' points pass their value as the probability threshold
        if is_default or 'Spec' in suffix:
            override = None
        else:
            override = value
        if is_default or 'Precision' in suffix:
            cutoff = 0.5
        else:
            cutoff = value
        metrics = binary_metrics(y_true=y_test,
                                 y_prob=y_prob,
                                 y_pred_override=override,
                                 threshold=cutoff,
                                 nb_weight=nbw,
                                 n_params=leaf_total,
                                 decimals=5,
                                 verbose=False,
                                 prefix=prefix)
        output_dict.update({f'{k}{suffix}': v for k, v in metrics.items()})
    return output_dict, round(finished - started, 3)

In [11]:
########################################################################################################################
# 4. Define a function to extract the SHAP feature importance of a XGBoost classification model
########################################################################################################################
def get_shap(model, X_test, feature_names):
    """Rank features by mean absolute contribution over ``X_test``.

    The booster is asked for per-sample contributions (``pred_contribs=True``); the last column of that
    output is discarded before averaging (in XGBoost this final column is the bias term).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``MeanAbsSHAP``, sorted by ``MeanAbsSHAP`` in descending order.
    """
    dmat = xgb.DMatrix(X_test, feature_names=feature_names, missing=np.nan)
    contribs = model.get_booster().predict(dmat, pred_contribs=True)

    # Keep only the per-feature columns, then average magnitudes across samples
    per_feature = contribs[:, :-1]
    importance = np.mean(np.abs(per_feature), axis=0)

    table = pd.DataFrame({'Feature': feature_names, 'MeanAbsSHAP': importance})
    return table.sort_values(by='MeanAbsSHAP', ascending=False)

In [12]:
########################################################################################################################
# 5. Define a function to load the data
########################################################################################################################
def data_load(C: int,
              D: int,
              impute: str,
              feats: Optional[list[str]] = None):
    """Load the train/test arrays and feature names of one prepared dataset.

    Files are read from ``<IN_DIR_PATH>/{C}_encounters_{D}_days/{impute}/``.

    Parameters
    ----------
    C : int
        Number of encounters (part of the dataset directory name).
    D : int
        Look-back window width in days (part of the dataset directory name).
    impute : str
        Imputation method (name of the sub-directory).
    feats : list[str], optional
        If given, the feature matrices are reduced to these columns, in this order (used for ablation).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names, data_str)`` where ``data_str`` is
        ``'{C}_encounters_{D}_days_{impute}'`` with ``'_Ablated'`` appended when ``feats`` is given.

    Raises
    ------
    AssertionError
        If ``feats`` contains a name that is not listed in ``Feature_Names.csv``.
    """
    # Specify the directory of the datasets
    dir_path: str = os.path.join(IN_DIR_PATH, f'{C}_encounters_{D}_days', impute)

    def _load_array(file_name: str) -> np.ndarray:
        # Each file path is built explicitly. Deriving the test paths with
        # str.replace('train', 'test') would also rewrite any 'train' in the directory part.
        # NOTE(review): allow_pickle=True can execute arbitrary code from a malicious .npy file;
        # only point IN_DIR_PATH at trusted data.
        return np.load(os.path.join(dir_path, file_name), allow_pickle=True)

    # Load the datasets
    X_train: np.ndarray = _load_array('X_train.npy')
    X_test: np.ndarray = _load_array('X_test.npy')
    y_train: np.ndarray = _load_array('y_train.npy')
    y_test: np.ndarray = _load_array('y_test.npy')
    feat_names: list[str] = pd.read_csv(os.path.join(dir_path, 'Feature_Names.csv'))['Features'].to_list()

    # Specify the name of the dataset
    data_str: str = f'{C}_encounters_{D}_days_{impute}'

    # Without a feature subset, everything is returned as loaded
    if feats is None:
        return X_train, X_test, y_train, y_test, feat_names, data_str

    # Truncate the feature datasets (for ablation purposes)
    first_index: dict[str, int] = {}
    for position, name in enumerate(feat_names):
        first_index.setdefault(name, position)      # first occurrence wins, like list.index
    unknown: list[str] = [f for f in feats if f not in first_index]
    assert not unknown, f'Requested features not found in Feature_Names.csv: {unknown}'
    idxs: list[int] = [first_index[f] for f in feats]
    X_train = X_train[:, idxs]
    X_test = X_test[:, idxs]
    data_str += '_Ablated'

    # Return the datasets, the features being used, and the name of the dataset
    return X_train, X_test, y_train, y_test, feats, data_str

In [9]:
########################################################################################################################
# 6. Define an overall function to run XGBoost with cross-validation
########################################################################################################################
def run_XGB_opt(C: int,
                D: int,
                impute: str,
                feats: Optional[list[str]] = None,
                csl: bool = True):                  # csl: cost-sensitive learning
    """Grid-search the XGBoost settings with stratified k-fold cross-validation on the training set.

    The grid is ``EST_GRID`` x ``CS_GRID`` when ``csl`` is True, and ``EST_GRID`` alone (cost multiplier
    fixed to 1) otherwise. For every grid element a model is trained and evaluated on each of the
    ``N_FOLDS`` folds, and the fold metrics are averaged.

    Parameters
    ----------
    C, D, impute, feats
        Forwarded to ``data_load`` to select the dataset (and optional feature subset).
    csl : bool
        Whether the cost multiplier is part of the search grid.

    Returns
    -------
    dict
        Maps each grid element (``(est, cost_mul)`` tuple if ``csl`` else ``est``) to a one-row
        ``pandas.DataFrame`` holding the fold-averaged validation statistics.
    """
    # Load the dataset (only the training partition is needed for cross-validation)
    X_train, _, y_train, _, _, _ = data_load(C=C, D=D, impute=impute, feats=feats)

    # Logging
    log_head: str = f'[C={C}; D={D}; impute={impute}; CSL={csl}] '

    # Define the starting time of the optimization pipeline
    t0_out: float = time()

    # Define the folds for cross-validation once; with a fixed integer seed every call to
    # split() yields the same partition, so all grid elements are compared on identical folds
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=42, shuffle=True)

    # Loop over the full grid
    full_grid = list(product(EST_GRID, CS_GRID)) if csl else list(EST_GRID)
    result_dict: dict = {}
    for grid_element in full_grid:
        if csl:
            est, cost_mul = grid_element
        else:
            est, cost_mul = grid_element, 1
        cost_mul_log = cost_mul if cost_mul != 1 else 'NONE'

        # Loop over the k folds
        fold_results: list[dict] = []
        for k_idx, (train, val) in enumerate(skf.split(X_train, y_train), 1):
            X_train_cur, X_val_cur = np.take(X_train, train, axis=0), np.take(X_train, val, axis=0)
            y_train_cur, y_val_cur = np.take(y_train, train), np.take(y_train, val)

            # Logging (the fold total follows N_FOLDS instead of a hard-coded 5)
            log_head_sub = f'{log_head}(#Est={est}; cost_multiplier={cost_mul_log}; Fold={k_idx}/{N_FOLDS}) '

            # Create the XGBoost model
            M = create_xgb(y_train_cur, n_estimators=est)

            # In-fold fitting
            t0: float = time()
            M = train_xgb(M, X_train_cur, y_train_cur, X_val_cur, y_val_cur, cost_mul=cost_mul)
            train_elapsed: float = round(time() - t0, 3)
            print(f'{log_head_sub}Training took {train_elapsed} seconds.')

            # In-fold evaluation (entries whose key contains 'LIST' are dropped before averaging)
            val_result: dict[str, float] = eval_xgb(M, X_val_cur, y_val_cur, prefix='Val_')[0]
            val_result = {'#Est': est, 'Cost_multiplier': cost_mul} | {k: v for k, v in val_result.items() if 'LIST' not in k}
            fold_results.append(val_result)
            print('-'*120)

        # Store the averaged validation performance statistics
        result_dict[grid_element] = pd.DataFrame(pd.DataFrame.from_records(fold_results).mean()).T

    print(f'{log_head}XGBoost optimization completed in {round(time() - t0_out, 3)} seconds.')
    return result_dict

In [10]:
########################################################################################################################
# 7. Define an overall function to generate the modeling result
########################################################################################################################
def run_XGB_pipeline(C: int,
                     D: int, 
                     impute: str,
                     feats: Optional[list[str]] = None, 
                     csl: bool = True):
    """Optimize, refit and evaluate an XGBoost model for one experimental setting.

    Steps: run the cross-validated grid search (``run_XGB_opt``), pick the grid element with the highest
    averaged validation metric (``'Val_Recall@99Spec'`` if ``csl`` else ``'Val_AUPRC'``), refit on a
    stratified 8:2 split of the training data, then evaluate on the training and test partitions.

    Parameters
    ----------
    C, D, impute, feats
        Forwarded to ``data_load`` / ``run_XGB_opt`` to select the dataset (and optional feature subset).
    csl : bool
        Whether to use cost-sensitive learning (the cost multiplier is then also optimized).

    Returns
    -------
    tuple
        ``(final_result, df_shap, df_opt, df_y)``: the summary dictionary of the experiment, the
        mean-absolute-SHAP table on the test set, the cross-validation statistics of every grid element,
        and a frame with the test labels (``y_test``) and predicted probabilities (``y_prob``).
    """
    # Logging
    log_head: str = f'[C={C}; D={D}; impute={impute}; CSL={csl}] '

    # Run XGBoost with n_estimator optimization
    opt_results: dict = run_XGB_opt(C=C, D=D, impute=impute, feats=feats, csl=csl)

    # Identify the grid element which has the highest averaged validation metric
    metric: str = 'Val_Recall@99Spec' if csl else 'Val_AUPRC'
    df_opt: pd.DataFrame = pd.concat(opt_results.values()).reset_index(drop=True)
    best_row: pd.Series = df_opt.loc[df_opt[metric].idxmax()]
    opt_est: int = int(best_row['#Est'])
    print(f'{log_head}Optimized model n_estimator={opt_est}')
    if csl: 
        opt_cost_mul: int = int(best_row['Cost_multiplier'])
        print(f'{log_head}Optimized cost multiplier={opt_cost_mul}')
    else:
        opt_cost_mul = 1

    # Re-load the dataset and form 8:2 stratified partition for training and validation.
    # The split is seeded (42, like the folds and the model) so that it does not depend on the global
    # NumPy random state, i.e. on which experiments or cells happened to run before this one.
    X_train, X_test, y_train, y_test, feat_names, data_str = data_load(C=C, D=D, impute=impute, feats=feats)
    X_train_cur, X_val_cur, y_train_cur, y_val_cur = train_test_split(X_train, y_train, test_size=0.2,
                                                                      stratify=y_train, random_state=42)
    if csl:
        data_str += '_CSL'

    # Create and refit the model with the optimized n_estimators (and optimal cost multiplier if CSL)
    M = create_xgb(y_train_cur, n_estimators=opt_est)
    t0: float = time()
    M = train_xgb(M, X_train_cur, y_train_cur, X_val_cur, y_val_cur, cost_mul=opt_cost_mul)
    train_elapsed: float = round(time() - t0, 3)
    print(f'{log_head}Training took {train_elapsed} seconds.')

    # Held-out evaluation (note: the 'Train_' statistics are computed on the full training partition,
    # which includes the 20% that was only used as the evaluation set during fitting)
    train_result: dict[str, float] = eval_xgb(M, X_train, y_train, prefix='Train_')[0]
    test_result, test_elapsed = eval_xgb(M, X_test, y_test, prefix='Test_')
    print(f'{log_head}Basic evaluation completed.')

    # Organize the results
    final_result: dict[str, object] = {'Algorithm': 'XGBoost',
                                       'Model_Width': opt_est,
                                       '#Encounters': C,
                                       'LookBackDays': D,
                                       'Impute': impute,
                                       'Experiment_Name': data_str,
                                       'Features': 'All' if feats is None else 'Ablated',
                                       'Cost_Ratio': opt_cost_mul if csl else 'NONE',
                                       'Train_Sample_Size': X_train.shape[0],
                                       'Test_Sample_Size': X_test.shape[0],
                                       'Feature_Size': X_train.shape[1],
                                       'Prevalence': np.round(np.mean(y_train), 3),
                                       'Training_Time_Seconds': train_elapsed,
                                       'Test_Time_Seconds': test_elapsed}
    final_result |= test_result | train_result
    df_shap: pd.DataFrame = get_shap(M, X_test, feat_names)
    df_y: pd.DataFrame = pd.DataFrame({'y_test': y_test, 'y_prob': M.predict_proba(X_test)[:, 1]})
    return final_result, df_shap, df_opt, df_y

In [None]:
########################################################################################################################
# 8. Run all the experiments (and store results in the current working directory)
########################################################################################################################
# Accumulates one summary dictionary per completed experiment
all_results: list[dict] = []
out_dir_path: str = 'XGB_Results/' if not ABLATION else 'XGB_Results_Ablated/'
os.makedirs(out_dir_path, exist_ok=True)

for exp_idx, (C, D, csl, impute) in enumerate(product(C_LIST, D_LIST, CSL_LIST, IMPUTE_LIST), 1):
    if C == 1 and D != 60:
        continue                # When C=1, all D values are the same

    # Logging 
    log_head: str = f'[Exp {exp_idx}. C={C}; D={D}; impute={impute}; CSL={csl}; Ablation={ABLATION}] '    
    print(f'{log_head}Starting experiment...')

    # Perform feature ablation if needed (based on the SHAP table saved by the pre-ablation run)
    if ABLATION:
        print(f'{log_head}Performing ablation...')
        shap_filename: str = f'{C}_encounters_{D}_days_{impute}{"_CSL" if csl else ""}.csv'
        df_shap: pd.DataFrame = pd.read_csv(f'XGB_Results/SHAP/{shap_filename}')
        # out_dir_path already ends with '/', so no extra separator is inserted here
        elbow_dir_path: str = f'{out_dir_path}SHAP/Elbow_Images/'
        os.makedirs(elbow_dir_path, exist_ok=True)
        elbow_filename: str = elbow_dir_path + shap_filename.replace('.csv', '_elbow.png')
        feats: Optional[list[str]] = ablate(df_shap, 'MeanAbsSHAP', elbow_filename)
    else:
        feats = None

    # Run XGB modeling
    cur_result, df_shap, df_opt, df_y = run_XGB_pipeline(C=C, D=D, impute=impute, feats=feats, csl=csl)
    print(f'{log_head}Experiment completed --> {cur_result["Experiment_Name"]}')

    # Save the mean-absolute SHAP values, the cross-validation statistics, and the predicted
    # probabilities, each as <out_dir_path><sub-directory>/<Experiment_Name>.csv
    for sub_dir_name, df_to_save in (('SHAP', df_shap),
                                     ('Validation_Statistics', df_opt),
                                     ('Predicted_Probabilities', df_y)):
        out_sub_dir_path: str = f'{out_dir_path}{sub_dir_name}/'
        os.makedirs(out_sub_dir_path, exist_ok=True)
        out_file_path: str = f'{out_sub_dir_path}{cur_result["Experiment_Name"]}.csv'
        df_to_save.to_csv(out_file_path, index=False)

    # Concatenate the cur_result to all_results
    all_results.append(cur_result)
    print('*'*120)

    # Organize the current version of all_results as a pandas.DataFrame
    df_out: pd.DataFrame = pd.DataFrame.from_records(all_results)
    df_out = df_out.drop(columns=[col for col in df_out.columns if 'LIST' in col])

    # Save (and overwrite) the current version of the exported df_out
    out_file_path: str = f'{out_dir_path}Experiment_XGB{"_Ablated" if ABLATION else ""}_{C}_{D}.xlsx'
    df_out.to_excel(out_file_path, index=False)
    optimize_width(out_file_path)
    print('Modeling result saved.')
    print('*'*120)