In [12]:
########################################################################################################################
# This script runs ElasticNet modeling.
########################################################################################################################

In [3]:
########################################################################################################################
# Import packages
########################################################################################################################
import inspect
import os
import warnings
from itertools import product
from time import time
from typing import Literal, Optional

import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

from _Helper_Scripts.ablation import ablate
from _Helper_Scripts.binary_metrics import binary_metrics, flagged_at_top_k_ppv, nb_weight_from_pt, threshold_at_specificity_k
from _Helper_Scripts.result_organizer import optimize_width
# Silence three warning categories for the whole session: pandas SettingWithCopyWarning,
# scikit-learn ConvergenceWarning, and FutureWarning.
# NOTE(review): ignoring FutureWarning also hides library deprecation notices.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Seed NumPy's global random number generator
np.random.seed(42)
print('Packages loaded')

Packages loaded


In [5]:
########################################################################################################################
# USER_SPECIFIC SETTING
# C_LIST: A list of different numbers of feature encounters to be included
# D_LIST: A list of different maximum widths of the look-back window in days
# CSL_LIST: A list of booleans indicating whether to perform cost-sensitive learning (CSL)
# IMPUTE_LIST: A list of strings representing the imputation methods adopted
# ABLATION: Boolean. False for pre-ablation modeling and True for post-ablation modeling
# CS_GRID: A list of cost multipliers to penalize false positives
# MAX_ITER: An integer representing the maximum number of iterations for Elastic Net modeling
# N_FOLDS: An integer representing the number of folds for cross-validation
########################################################################################################################
# All values below are iterated over (as a Cartesian product of the *_LIST settings) by the experiment loop
C_LIST: list[int] = [1]                         # Number of encounters (Default: [1])
D_LIST: list[int] = [60]                        # Width of look-back window (Default: [60])
CSL_LIST: list[bool] = [False, True]            # Whether to perform cost-sensitive learning (Default: [False, True])
IMPUTE_LIST: list[str] = ['Zero', 'Mean', 'Median'] 
                                                # Method of imputation (Default: ['Zero', 'Mean', 'Median'])
ABLATION: bool = False                          # Whether to perform model ablation (Default: False. Use True only after False)
CS_GRID: list[int] = [2, 3, 4]                  # Grid for cost multipliers (Default: [2, 3, 4])
MAX_ITER: int = 1000                            # Maximum number of iterations for modeling
N_FOLDS: int = 5                                # Number of folds for cross-validation (Default: 5)

In [6]:
########################################################################################################################
# USER_SPECIFIC SETTING
# IN_DIR_PATH: Path of the input directory storing the organized datasets for modeling
# (created in P06_Point_Data_Preparation.ipynb)
########################################################################################################################
# Relative to the notebook's working directory; data_load() appends the
# '<C>_encounters_<D>_days/<impute>/' sub-directories to this path.
IN_DIR_PATH: str = '../00_Data/02_Processed_Data/Point_Model_Data/'

In [7]:
########################################################################################################################
# 1. Define a scoring function (recall evaluated at 99% specificity) for cross-validation
########################################################################################################################
def recall_at_spec(y_true, y_prob):
    """Recall (sensitivity) at the threshold that yields roughly 99% specificity.

    The threshold is placed just above the 0.99 quantile of the scores of the
    negative class (label 0); samples scoring at or above it are predicted
    positive, and the fraction of positives (label 1) recovered is returned.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 = negative, 1 = positive). Lists, NumPy arrays, and
        pandas Series are accepted.
    y_prob : array-like
        Scores for the positive class, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Recall at the chosen threshold; ``0.0`` if either class is absent or
        the threshold cannot be computed.
    """
    # Coerce once so boolean masks and indexing behave the same for every array-like input
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    is_pos = y_true == 1
    is_neg = y_true == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())
    if n_pos == 0 or n_neg == 0:
        # Recall / specificity is undefined without both classes; always report a float
        return 0.0

    try:
        tau = np.quantile(y_prob[is_neg], 0.99, method='linear')
        # Nudge the cut-off to the next representable float so a score exactly
        # equal to the quantile is NOT flagged
        tau = float(np.nextafter(tau, np.inf))
    except Exception:
        # Best-effort behaviour kept from the original: a threshold failure scores 0.0
        return 0.0

    flagged = y_prob >= tau
    tp = int((is_pos & flagged).sum())
    return float(tp / n_pos)

def recall_at_spec_sklearn(yt, yp):
    """Adapter around ``recall_at_spec`` for use as a scikit-learn score function.

    ``yp`` may be a 1-D score vector or a 2-D array; for a 2-D array, column
    index 1 is used as the score passed on to ``recall_at_spec``.
    """
    scores = np.asarray(yp)
    # Reduce a 2-D array to its second column; leave anything else as-is
    positive_scores = scores[:, 1] if scores.ndim == 2 else scores
    return recall_at_spec(yt, positive_scores)

# Build the scorer so that it receives predicted probabilities.
# scikit-learn 1.4 introduced `response_method` and deprecated `needs_proba`; the latter was
# removed in 1.6, where it would be forwarded to the score function and fail only at scoring
# time. Pick whichever keyword the installed make_scorer actually supports.
if 'response_method' in inspect.signature(make_scorer).parameters:
    recall_at_spec_scorer = make_scorer(recall_at_spec_sklearn, response_method='predict_proba')
else:
    recall_at_spec_scorer = make_scorer(recall_at_spec_sklearn, needs_proba=True)

In [12]:
########################################################################################################################
# 2. Define a function to create an ElasticNet model embedded in a GridSearchCV
########################################################################################################################
def create_EN(csl=True):
    """Build an (unfitted) elastic-net logistic regression wrapped in cross-validation.

    Parameters
    ----------
    csl : bool, default True
        If True, return a ``GridSearchCV`` over the inverse regularization
        strength ``C`` and the class weights ``{0: x, 1: 1}`` for each ``x`` in
        ``CS_GRID``. If False, return a ``LogisticRegressionCV`` that tunes
        ``C`` only.

    Returns
    -------
    GridSearchCV or LogisticRegressionCV
        Both use ``N_FOLDS`` folds, ``recall_at_spec_scorer`` for model
        selection, an L1 ratio of 0.5, and refit on the full training data.
    """
    c_grid = np.logspace(-2, 2, 10)

    # Solver settings shared by both estimator flavours
    shared_kwargs = dict(penalty='elasticnet',
                         solver='saga',
                         max_iter=MAX_ITER,
                         tol=1e-5,
                         n_jobs=-1,
                         random_state=42)

    if not csl:
        # No cost-sensitive learning: the built-in CV estimator tunes C on its own
        return LogisticRegressionCV(Cs=c_grid,
                                    cv=N_FOLDS,
                                    scoring=recall_at_spec_scorer,
                                    refit=True,
                                    l1_ratios=[0.5],
                                    **shared_kwargs)

    # Cost-sensitive learning: search jointly over class weights and C
    weight_grid: list[dict[int, float]] = [{0: multiplier, 1: 1} for multiplier in CS_GRID]
    base_model = LogisticRegression(l1_ratio=0.5, **shared_kwargs)
    search_space = {'class_weight': weight_grid, 'C': c_grid}
    return GridSearchCV(estimator=base_model,
                        param_grid=search_space,
                        scoring=recall_at_spec_scorer,
                        cv=N_FOLDS,
                        n_jobs=-1,
                        refit=True)

In [11]:
########################################################################################################################
# 3. Define a function to train an ElasticNet model
########################################################################################################################
def train_EN(M, X_train, y_train):
    """Fit ``M`` on the training data and return it.

    The fit runs inside joblib's ``parallel_backend('threading')`` context, so
    joblib-parallelised work started by ``M.fit`` uses the threading backend.

    Parameters
    ----------
    M : estimator exposing ``fit(X, y)`` (as built by ``create_EN``)
    X_train : training feature matrix
    y_train : training labels

    Returns
    -------
    The same object ``M``, after ``fit`` has been called on it.
    """
    with parallel_backend('threading'):
        M.fit(X_train, y_train)
    return M

In [10]:
########################################################################################################################
# 4. Define a function to evaluate an ElasticNet model
########################################################################################################################
def eval_EN(model, X_test, y_test, prefix=''):
    """Score ``model`` on ``(X_test, y_test)`` at several operating points.

    Metrics are computed by ``binary_metrics`` once per operating point: the
    default 0.5 threshold (no suffix), three ``flagged_at_top_k_ppv`` results
    (k = 1, 2, 5; suffix ``@Precision<k>%``), and three thresholds from
    ``threshold_at_specificity_k`` (99, 95, 90; suffix ``@<k>Spec``).
    NOTE(review): the exact semantics of those helpers live in _Helper_Scripts
    and are not visible here.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    X_test : feature matrix to evaluate on
    y_test : true labels aligned with ``X_test``
    prefix : str, forwarded to ``binary_metrics`` (e.g. 'Train_' / 'Test_')

    Returns
    -------
    tuple[dict, float]
        The metric dictionary (each key carries the operating-point suffix) and
        the ``predict_proba`` wall time in seconds, rounded to 3 decimals.
    """
    started = time()
    y_prob = model.predict_proba(X_test)[:, 1]
    predict_seconds = round(time() - started, 3)

    # (value, suffix) pairs; the suffix decides below how the value is handed to binary_metrics
    operating_points: list[tuple[float, str]] = [(0.5, '')]
    operating_points += [(flagged_at_top_k_ppv(y_prob, k=k), f'@Precision{k}%') for k in (1, 2, 5)]
    operating_points += [(threshold_at_specificity_k(y_test, y_prob, k), f'@{k}Spec') for k in (99, 95, 90)]

    nbw = nb_weight_from_pt(1/11)
    n_params: int = X_test.shape[1]

    output_dict: dict[str, float] = {}
    for value, suffix in operating_points:
        is_default = suffix == ''
        # Precision entries are passed as y_pred_override; Spec entries are passed as the threshold
        override = None if (is_default or 'Spec' in suffix) else value
        cutoff = 0.5 if (is_default or 'Precision' in suffix) else value
        metrics: dict[str, float] = binary_metrics(y_true=y_test,
                                                   y_prob=y_prob,
                                                   y_pred_override=override,
                                                   threshold=cutoff,
                                                   nb_weight=nbw,
                                                   n_params=n_params,
                                                   decimals=5,
                                                   verbose=False,
                                                   prefix=prefix)
        for key, val in metrics.items():
            output_dict[f'{key}{suffix}'] = val
    return output_dict, predict_seconds

In [13]:
########################################################################################################################
# 5. Define a function to extract the linear coefficients (as feature importance) of an ElasticNet model
########################################################################################################################
def get_coef(model, feature_names):
    """Tabulate the linear coefficients of a fitted model.

    Parameters
    ----------
    model : object exposing ``coef_`` directly, or a search wrapper exposing
        ``best_estimator_.coef_``. Only the first row of ``coef_`` is used.
    feature_names : sequence of names, one per coefficient.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Coefficient`` and ``Absolute_Coefficient``.
    """
    # Search wrappers keep the coefficients on their refitted best estimator
    fitted = model if hasattr(model, 'coef_') else model.best_estimator_
    weights = fitted.coef_[0]

    columns = {
        'Feature': feature_names,
        'Coefficient': weights,
        'Absolute_Coefficient': np.abs(weights),
    }
    return pd.DataFrame(columns)

In [14]:
########################################################################################################################
# 6. Define a function to load the data
########################################################################################################################
def data_load(C: int,
              D: int,
              impute: str,
              feats: Optional[list[str]] = None):
    """Load the train/test arrays and feature names for one experiment setting.

    Files are read from ``IN_DIR_PATH/<C>_encounters_<D>_days/<impute>/``:
    ``X_train.npy``, ``X_test.npy``, ``y_train.npy``, ``y_test.npy`` and
    ``Feature_Names.csv`` (column ``Features``).

    Parameters
    ----------
    C : int
        Number of encounters; part of the dataset directory name.
    D : int
        Look-back window in days; part of the dataset directory name.
    impute : str
        Imputation method; name of the innermost sub-directory.
    feats : list[str] or None
        If given, the feature matrices are restricted to these columns, in the
        given order (used for ablation). Every name must appear in
        ``Feature_Names.csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names, data_str)`` where
        ``data_str`` is ``'<C>_encounters_<D>_days_<impute>'`` plus
        ``'_Ablated'`` when ``feats`` is given.

    Raises
    ------
    AssertionError
        If ``feats`` contains a name that is not a known feature.
    """
    # Specify the paths of the datasets. Each file name is spelled out explicitly: deriving the
    # test path via str.replace('train', 'test') would also rewrite any 'train' substring that
    # happens to occur in the directory part of the path.
    dir_path: str = os.path.join(IN_DIR_PATH, f'{C}_encounters_{D}_days/', f'{impute}/')
    X_train_path: str = os.path.join(dir_path, 'X_train.npy')
    X_test_path: str = os.path.join(dir_path, 'X_test.npy')
    y_train_path: str = os.path.join(dir_path, 'y_train.npy')
    y_test_path: str = os.path.join(dir_path, 'y_test.npy')
    feat_name_path: str = os.path.join(dir_path, 'Feature_Names.csv')

    # Load the datasets
    # SECURITY NOTE: allow_pickle=True lets np.load unpickle objects, which can execute arbitrary
    # code. It is kept for compatibility with the existing files; only load trusted data.
    X_train: np.ndarray = np.load(X_train_path, allow_pickle=True)
    X_test: np.ndarray = np.load(X_test_path, allow_pickle=True)
    y_train: np.ndarray = np.load(y_train_path, allow_pickle=True)
    y_test: np.ndarray = np.load(y_test_path, allow_pickle=True)
    feat_names: list[str] = pd.read_csv(feat_name_path)['Features'].to_list()

    # Specify the name of the dataset
    data_str: str = f'{C}_encounters_{D}_days_{impute}'

    # Truncate the feature datasets if needed (for ablation purposes)
    if feats is not None:
        unknown = set(feats) - set(feat_names)
        assert not unknown, f'Unknown features requested: {sorted(unknown)}'
        # Map each name to the position of its first occurrence (same result as list.index)
        position: dict[str, int] = {}
        for idx, name in enumerate(feat_names):
            position.setdefault(name, idx)
        idxs: list[int] = [position[f] for f in feats]
        X_train = X_train[:, idxs]
        X_test = X_test[:, idxs]
        data_str += '_Ablated'

    # Specify the features being used in the dataset
    feats_out = feat_names if feats is None else feats

    # Return the datasets, features, and the name of the dataset
    return X_train, X_test, y_train, y_test, feats_out, data_str

In [22]:
########################################################################################################################
# 7. Define an overall function to run Elastic Net modeling
########################################################################################################################
def run_EN_pipeline(C: int,
                    D: int,
                    impute: str,
                    feats: Optional[list[str]] = None,
                    csl: bool = True):
    """Load one dataset, fit an elastic-net model on it, and evaluate the fit.

    Parameters
    ----------
    C, D, impute : dataset selectors, forwarded to ``data_load``.
    feats : optional feature subset (ablation), forwarded to ``data_load``.
    csl : whether to use cost-sensitive learning (see ``create_EN``).

    Returns
    -------
    tuple
        ``(final_result, df_coef, df_y)``: a flat dict of experiment metadata
        plus train/test metrics, the coefficient table from ``get_coef``, and
        a frame holding ``y_test`` next to the predicted positive-class
        probability on the test set.
    """
    # Data
    X_train, X_test, y_train, y_test, feat_names, data_str = data_load(C=C, D=D, impute=impute, feats=feats)

    # Log prefix and experiment name
    log_head: str = f'[C={C}; D={D}; impute={impute}; CSL={csl}] '
    experiment_name: str = f'{data_str}_CSL' if csl else data_str

    # Build the model and time the fit
    estimator = create_EN(csl=csl)
    fit_started: float = time()
    estimator = train_EN(estimator, X_train, y_train)
    train_elapsed: float = round(time() - fit_started, 3)
    print(f'{log_head}Training took {train_elapsed} seconds.')

    # Selected cost multiplier (weight of class 0); only tuned under CSL
    if csl:
        opt_cost_mul = estimator.best_params_['class_weight'][0]
    else:
        opt_cost_mul = 'NONE'
    print(f'{log_head}Optimized cost multiplier={opt_cost_mul}')

    # Selected inverse regularization strength C
    if csl:
        opt_reg = estimator.best_params_['C']
    else:
        opt_reg = estimator.C_[0]
    print(f'{log_head}Optimized inversed regularization term={opt_reg}')

    # Metrics on both splits; prediction time is only kept for the test split
    train_result, _ = eval_EN(estimator, X_train, y_train, prefix='Train_')
    test_result, test_elapsed = eval_EN(estimator, X_test, y_test, prefix='Test_')
    print(f'{log_head}Basic evaluation completed.')

    # Metadata first, then test metrics, then train metrics (insertion order sets column order)
    n_train, n_features = X_train.shape[0], X_train.shape[1]
    final_result: dict = {'Algorithm': 'Elastic_Net',
                          'Model_Width': opt_reg,
                          '#Encounters': C,
                          'LookBackDays': D,
                          'Impute': impute,
                          'Experiment_Name': experiment_name,
                          'Features': 'All' if feats is None else 'Ablated',
                          'Cost_Ratio': opt_cost_mul,
                          'Train_Sample_Size': n_train,
                          'Test_Sample_Size': X_test.shape[0],
                          'Feature_Size': n_features,
                          'Prevalence': np.round(np.mean(y_train), 3),
                          'Training_Time_Seconds': train_elapsed,
                          'Test_Time_Seconds': test_elapsed}
    final_result.update(test_result | train_result)

    df_coef: pd.DataFrame = get_coef(estimator, feat_names)
    test_prob = estimator.predict_proba(X_test)[:, 1]
    df_y: pd.DataFrame = pd.DataFrame({'y_test': y_test, 'y_prob': test_prob})
    return final_result, df_coef, df_y

In [None]:
########################################################################################################################
# 8. Run all the experiments (and store results in the current working directory)
########################################################################################################################
# Accumulates one result dict per completed experiment
all_results: list[dict] = []
# Pre- and post-ablation runs write to separate directories (both paths end with '/')
out_dir_path: str = 'EN_Results/' if not ABLATION else 'EN_Results_Ablated/'
os.makedirs(out_dir_path, exist_ok=True)

for exp_idx, (C, D, csl, impute) in enumerate(product(C_LIST, D_LIST, CSL_LIST, IMPUTE_LIST), 1):
    if C == 1 and D != 60:
        continue                # When C=1, all D values are the same

    # Logging
    log_head: str = f'[Exp {exp_idx}. C={C}; D={D}; impute={impute}; CSL={csl}; Ablation={ABLATION}] '
    print(f'{log_head}Starting experiment...')

    # Perform feature ablation if needed
    if ABLATION:
        print(f'{log_head}Performing ablation...')
        # Despite the "shap" names, these hold the coefficient table saved by the matching
        # pre-ablation run under EN_Results/COEF/
        shap_filename: str = f'{C}_encounters_{D}_days_{impute}{"_CSL" if csl else ""}.csv'
        df_shap: pd.DataFrame = pd.read_csv(f'EN_Results/COEF/{shap_filename}')
        # out_dir_path already ends with '/', so no extra separator is inserted here
        elbow_dir_path: str = f'{out_dir_path}COEF/Elbow_Images/'
        os.makedirs(elbow_dir_path, exist_ok=True)
        elbow_filename: str = elbow_dir_path + shap_filename.replace('.csv', '_elbow.png')
        feats = ablate(df_shap, 'Absolute_Coefficient', elbow_filename)
    else:
        feats = None

    # Run Elastic Net modeling
    cur_result, df_coef, df_y = run_EN_pipeline(C=C, D=D, impute=impute, feats=feats, csl=csl)
    print(f'{log_head}Experiment completed --> {cur_result["Experiment_Name"]}')

    # Save the linear coefficients
    out_sub_dir_path: str = f'{out_dir_path}COEF/'
    os.makedirs(out_sub_dir_path, exist_ok=True)
    out_file_path: str = f'{out_sub_dir_path}{cur_result["Experiment_Name"]}.csv'
    df_coef.to_csv(out_file_path, index=False)

    # Save the predicted probabilities
    out_sub_dir_path = f'{out_dir_path}Predicted_Probabilities/'
    os.makedirs(out_sub_dir_path, exist_ok=True)
    out_file_path = f'{out_sub_dir_path}{cur_result["Experiment_Name"]}.csv'
    df_y.to_csv(out_file_path, index=False)

    # Concatenate the result
    all_results.append(cur_result)
    print('*'*120)

    # Organize the current version of all_results as a pandas.DataFrame,
    # dropping any column whose name contains 'LIST'
    df_out: pd.DataFrame = pd.DataFrame.from_records(all_results)
    df_out = df_out.drop(columns=[col for col in df_out.columns if 'LIST' in col])

    # Save (and overwrite) the current version of the exported df_out.
    # NOTE(review): the file is named after the CURRENT (C, D) but contains every result
    # accumulated so far, including earlier (C, D) combinations — confirm this is intended.
    out_file_path = f'{out_dir_path}Experiment_EN{"_Ablated" if ABLATION else ""}_{C}_{D}.xlsx'
    df_out.to_excel(out_file_path, index=False)
    optimize_width(out_file_path)
    print('Modeling result saved.')
    print('*'*120)