In [None]:
########################################################################################################################
# This script identifies the patients to be included in the experiment for different experiment settings.
########################################################################################################################

In [1]:
########################################################################################################################
# Import packages
########################################################################################################################
import datetime
import numpy as np
import os
import pandas as pd
import warnings
from itertools import product
pd.set_option('display.float_format', '{:.0f}'.format)
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
########################################################################################################################
# USER-SPECIFIC SETTING
# PAT_PATH: Path of the patient-level parquet dataset (created in C07_Date_Adjustment.ipynb)
# ENC_PATH: Path of the encounter-level parquet dataset (created in C07_Date_Adjustment.ipynb)
# OUT_DIR_PATH: Path of the directory of the output datasets (created on export if it does not exist yet)
########################################################################################################################
PAT_PATH: str = '../00_Data/01_Cleaned_Data/Patient_full_v2.parquet'
ENC_PATH: str = '../00_Data/01_Cleaned_Data/Encounter_full_v2.parquet'
OUT_DIR_PATH: str = '../00_Data/02_Processed_Data/Targets/'

In [None]:
########################################################################################################################
# USER-SPECIFIC SETTING
# Cs: Different numbers of feature encounters to be included
# Ds: Different maximum widths of the look-back window in days
# One output dataset is produced per (C, D) pair; for C=1 only the first entry of Ds is processed by the main loop.
########################################################################################################################
Cs : list[int] = [1, 2, 3, 4]
Ds : list[int] = [60, 120, 180]

In [None]:
########################################################################################################################
# Define a function to return the target distribution 
########################################################################################################################
def target_dist(df_in: pd.DataFrame, col: str):
    """Summarise how the values of a target column are distributed.

    Args:
        df_in: Dataset holding the target column.
        col: Name of the target column; must be a column of ``df_in``.

    Returns:
        A one-line string mapping every distinct value (in sorted order) to a
        ``(count, 'xx.x%')`` pair, followed by a ``'Total'`` entry carrying the
        number of rows of ``df_in``.

    Raises:
        AssertionError: If ``col`` is not a column of ``df_in``.
    """
    assert col in df_in.columns

    labels = df_in[col]
    counts = labels.value_counts().sort_index().to_dict()
    shares = labels.value_counts(normalize=True).sort_index().to_dict()

    # Build the summary label by label; the share is rounded to 3 decimals before being shown as a percentage
    summary: dict = {}
    for label, n_rows in counts.items():
        pct_text = f'{round(shares[label], 3)*100:.1f}%'
        summary[label] = (n_rows, pct_text)

    # The total counts every row of df_in, including rows that value_counts() leaves out (missing values)
    summary['Total'] = (df_in.shape[0], '100.0%')
    return f'Distribution of target label ({col}): {summary}'

In [3]:
########################################################################################################################
# Load the patient-level dataset
########################################################################################################################
pat_id_col: str = 'PatientDurableKey'

# Columns read from the patient-level dataset: identifier, birth date, then the demographic indicator columns
needed_cols: list[str] = [
    pat_id_col,
    'BirthDate',
    # Sex
    'Sex^0=Female',
    'Sex^1=Male',
    'Sex^2=Other',
    # Gender identity
    'GenderIdentity^0=Female',
    'GenderIdentity^1=Male',
    'GenderIdentity^2=Transgender Male / Female-to-Male',
    'GenderIdentity^3=Transgender Female / Male-to-Female',
    'GenderIdentity!-1=Choose not to disclose',
    # Race
    'Race^0=American Indian or Alaska Native',
    'Race^1=Asian',
    'Race^2=Black or African American',
    'Race^3=Native Hawaiian or Other Pacific Islander',
    'Race^4=Other Race',
    'Race^5=White',
    'Race^6=Multiple Races',
    # Ethnicity
    'Ethnicity^1=Hispanic or Latino',
]

# Only the listed columns are loaded from the parquet file
df_y: pd.DataFrame = pd.read_parquet(PAT_PATH, columns=needed_cols)
print(f'{df_y.shape[0]} patients identified.')

578521 patients identified.


In [4]:
########################################################################################################################
# Load the encounter-level dataset
########################################################################################################################
enc_id_col: str = 'EncounterKey'
enc_date_col: str = 'EncDate'
sa_cur_col: str = 'CurrentSuicideAttempt^1=Y'

# Read the four needed columns, reduce the encounter timestamp to a calendar date, and order the encounters
# chronologically within each patient (ties on the same date are ordered by encounter key)
df_enc: pd.DataFrame = (
    pd.read_parquet(ENC_PATH, columns=[pat_id_col, enc_id_col, enc_date_col, sa_cur_col])
    .assign(**{enc_date_col: lambda frame: frame[enc_date_col].dt.date})
    .sort_values(by=[pat_id_col, enc_date_col, enc_id_col])
)
print(f'Encounter-level dataset loaded with dimension = {df_enc.shape}')

Encounter-level dataset loaded with dimension = (4243353, 4)


In [5]:
########################################################################################################################
# Merge the two datasets
########################################################################################################################
# Collapse the encounter-level rows into one row per patient; every encounter column becomes a list that follows
# the (patient, encounter date) ordering of the sorted rows
df_enc_gb: pd.DataFrame = (
    df_enc
    .sort_values([pat_id_col, enc_date_col])
    .groupby(pat_id_col, as_index=False)
    .agg({list_col: list for list_col in (enc_id_col, enc_date_col, sa_cur_col)})
)

# Attach the birth date; the left join keeps every patient of df_y, so a patient without any encounter
# ends up with missing values in the three list columns
df_enc_gb = df_y[[pat_id_col, 'BirthDate']].merge(df_enc_gb, how='left', on=pat_id_col)

In [None]:
########################################################################################################################
# Display the merged dataset
# One row per patient of df_y: the patient key, BirthDate, and the per-patient lists of encounter keys, encounter
# dates and suicide-attempt flags (missing for patients without encounters because of the left merge).
# NOTE(review): the rendered output contains patient-level keys and birth dates; clear it before sharing the notebook.
########################################################################################################################
df_enc_gb

In [10]:
########################################################################################################################
# Description of the subject inclusion criteria
########################################################################################################################
# Rule 1. Requirement of number of encounters
# The patient must have ≥ C+1 encounters (i.e., C feature encounters and 1 outcome encounter).

# Rule 2. Proximity of the most recent feature encounter and the outcome encounter
# Let last_feat_enc_date be the date of the most recent feature encounter.
# Days between outcome and last_feat_enc_date must be in (0, 30].

# Rule 3. Width of look-back window
# Let first_feat_enc_date be the date of the earliest feature encounter. 
# Days between outcome and first_feat_enc_date must be ≤ D.

# Rule 4. Age requirement
# Age at the outcome encounter must be in the range [10, 100].

########################################################################################################################
# Algorithm
########################################################################################################################
# Step 1. Given the specified configuration of C and D, loop over all the patients in the cohort.

# Step 2. For each patient A, set the patient's last encounter stored in the EHR system as the outcome encounter, and 
# the preceding C encounters as feature encounters.

# Step 3. Check whether Rules 1-4 are satisfied. If yes, include patient A with the corresponding set of encounters.
# Otherwise, remove the last encounter of A stored in the EHR system.

# Step 4. If patient A was included in Step 3, move on to the next patient.
# Otherwise, repeat Step 3 until either A is included or A has fewer than C+1 encounters left
# (violating Rule 1 and thus excluded).

In [None]:
########################################################################################################################
# Loop over the experiment configurations
########################################################################################################################
# Inclusion thresholds of Rules 2 and 4
MAX_OUTCOME_GAP_DAYS: int = 30      # R2: days between the last feature encounter and the outcome must be in (0, 30]
MIN_OUTCOME_AGE: int = 10           # R4: minimum age at the outcome encounter (inclusive)
MAX_OUTCOME_AGE: int = 100          # R4: maximum age at the outcome encounter (inclusive)


def _find_window(enc_dates: list, birth_ts: pd.Timestamp, C: int, D: int):
    """Find the most recent outcome encounter whose C preceding encounters satisfy Rules 2-4.

    Candidate outcome encounters are tried from the patient's last encounter backwards, which is equivalent to
    repeatedly dropping the last encounter until a valid window is found or fewer than C+1 encounters remain.

    Args:
        enc_dates: Encounter dates of one patient in chronological order; elements must support subtraction
            (they are datetime.date objects when produced by the loading cell of this notebook).
        birth_ts: Birth date of the patient as a non-null pandas Timestamp.
        C: Number of feature encounters preceding the outcome encounter.
        D: Maximum number of days between the earliest feature encounter and the outcome encounter.

    Returns:
        A tuple (outcome_idx, enc_age) with the position of the outcome encounter in enc_dates and the age at
        that encounter, or None if no window qualifies (including when there are fewer than C+1 encounters).
    """
    for outcome_idx in range(len(enc_dates) - 1, C - 1, -1):
        outcome_date = enc_dates[outcome_idx]

        # R2: the most recent feature encounter is the one right before the outcome encounter
        last_diff_days: int = (outcome_date - enc_dates[outcome_idx - 1]).days
        if not 0 < last_diff_days <= MAX_OUTCOME_GAP_DAYS:
            continue

        # R3: the earliest feature encounter lies C positions before the outcome encounter
        first_diff_days: int = (outcome_date - enc_dates[outcome_idx - C]).days
        if first_diff_days > D:
            continue

        # R4: age is only computed for windows that already passed R2 and R3 (it is the costly part)
        # NOTE(review): a 365-day year slightly overestimates age (leap days are ignored); the formula is kept
        # unchanged so that previously exported cohorts stay reproducible - confirm whether calendar age is wanted.
        enc_age: int = int((pd.Timestamp(outcome_date) - birth_ts).days / 365)
        if MIN_OUTCOME_AGE <= enc_age <= MAX_OUTCOME_AGE:
            return outcome_idx, enc_age

    return None


########################################################################################################################
# Prepare the per-patient inputs once (they do not depend on the experiment configuration)
########################################################################################################################
# Patients without any encounter carry a missing value instead of a list (left merge), and a missing birth date
# makes Rule 4 impossible to evaluate; both kinds of patients can never be included and are skipped up front.
has_encounters: pd.Series = df_enc_gb[enc_id_col].map(pd.api.types.is_list_like).astype(bool)
birth_ts_all: pd.Series = pd.to_datetime(df_enc_gb['BirthDate'])
is_usable: pd.Series = has_encounters & birth_ts_all.notna()
print(f'{int((~is_usable).sum())} of {df_enc_gb.shape[0]} patients skipped (no encounters or missing BirthDate).')

# Plain tuples are much cheaper to iterate than DataFrame.iterrows(), which builds a Series for every row
patients: list[tuple] = list(zip(df_enc_gb.loc[is_usable, pat_id_col],
                                 birth_ts_all[is_usable],
                                 df_enc_gb.loc[is_usable, enc_id_col],
                                 df_enc_gb.loc[is_usable, enc_date_col],
                                 df_enc_gb.loc[is_usable, sa_cur_col]))

os.makedirs(OUT_DIR_PATH, exist_ok=True)

########################################################################################################################
# Loop over the experiment configurations
########################################################################################################################
for exp_idx, (C, D) in enumerate(product(Cs, Ds), 1):
    log_head: str = f'[{exp_idx}. C={C}; D={D}] '

    # When C=1 the look-back width equals the gap bounded by Rule 2, so every D >= MAX_OUTCOME_GAP_DAYS selects the
    # same cohort and only the first D needs to be processed. The shortcut is not taken if some D is smaller.
    if C == 1 and Ds.index(D) > 0 and min(Ds) >= MAX_OUTCOME_GAP_DAYS:
        continue

    ####################################################################################################################
    # Collect the patients (and their encounters) to be included
    ####################################################################################################################
    include_list: list[dict] = []

    for pat_id, birth_ts, enc_keys, enc_dates, labels in patients:
        window = _find_window(enc_dates, birth_ts, C, D)
        if window is None:      # Rule 1 violated or no window satisfies Rules 2-4
            continue

        outcome_idx, enc_age = window
        include_list.append({pat_id_col: pat_id,
                             'OutcomeDate': enc_dates[outcome_idx],
                             'OutcomeEncKey': enc_keys[outcome_idx],
                             'OutcomeEncAge': enc_age,
                             'OutcomeLabel': labels[outcome_idx],
                             'FeatureEncKeys': enc_keys[outcome_idx - C:outcome_idx],
                             'FeatureEncDates': enc_dates[outcome_idx - C:outcome_idx]})

    # Without any included patient there is no 'OutcomeLabel' column to summarise and nothing to export
    if not include_list:
        print(f'{log_head}No patient satisfies the inclusion rules; nothing exported.')
        print('-'*120)
        continue

    df_include: pd.DataFrame = pd.DataFrame.from_records(include_list)
    print(f"{log_head}{df_include['OutcomeLabel'].value_counts().to_dict()} ... Prevalence = {df_include['OutcomeLabel'].mean()*100:.2f}%")

    # Merge with df_y (right join: keep exactly the included patients, with their patient-level columns)
    df_include = pd.merge(left=df_y, right=df_include, on=pat_id_col, how='right')

    # Export the result
    out_file_path: str = os.path.join(OUT_DIR_PATH, f'{C}_encounters_{D}_days_v1.csv')
    df_include.to_csv(out_file_path, index=False)
    print(f'{log_head}Dataset saved to {out_file_path}')
    print(f'{log_head}Dimension = {df_include.shape}')
    print('-'*120)