This notebook is only for encounter inclusion and exclusion, that is to say, the columns should not change after processing.  

Encounters Inclusion and Exclusion Criteria:
1. Age between 18 and 90 (the filter below keeps 18 ≤ age < 90, so an age of exactly 90 is excluded).  
2. Exclude patients with pre-existing end stage renal disease (ESRD), dialysis procedure or renal transplantation (RRT) prior to the visit.   
3. Exclude patients whose eGFR < 15 mL/min/1.73 m^2 or whose baseline SCr > 3.5 mg/dL.    
4. SCr trajectories satisfy the requirements (at least one SCr measurement every day of the 3-day observation window).  
5. Each encounter's AKI onset date is the onset date of its most severe AKI stage.  
6. Only hospital-acquired AKI is considered, that is onset 72h after admission.  

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import sys
import os
sys.path.append(os.path.abspath("/home/lideyi/AKI_GNN/notebooks/utils"))
from common_var import raw_path, ct_names, pat_id_cols, race_mapping

# Create a Record Table to Track Encounter Exclusion

In [2]:
# one row per screening step, one column per center; every count starts at 0
encounter_num_df = pd.DataFrame(
    data = 0,
    index = [
        'Total number of encounters',
        'Age between 18 and 90',
        'Patients with ESRD, dialysis and RRT excluded',
        'Patients with SCr baseline or eGFR out of range excluded',
        'Observation window should be complete',
    ],
    columns = ct_names,
)

In [3]:
def fill_in_encounter_num_df(screen_item, ct_names, onset_df, encounter_num_df):
    """Record, for one screening step, how many encounters each center still has.

    Args:
        screen_item: row label of ``encounter_num_df`` to fill in.
        ct_names: iterable of center names; each is looked up in
            ``onset_df.CENTER_NAME`` and used as a column label.
        onset_df: encounter table with a ``CENTER_NAME`` column.
        encounter_num_df: tracking table, updated in place.

    Returns:
        The same ``encounter_num_df`` object that was passed in.
    """
    center_of_encounter = onset_df.CENTER_NAME
    for ct_name in ct_names:
        # number of rows belonging to this center
        remaining = int((center_of_encounter == ct_name).sum())
        print('%s: %s %d'%(ct_name, screen_item, remaining))
        encounter_num_df.loc[screen_item, ct_name] = remaining
    return encounter_num_df

# Read Patient ID DataFrame

In [4]:
# load the encounter-level onset table (one row per encounter)
onset_df = pd.read_csv(filepath_or_buffer = '/blue/yonghui.wu/lideyi/AKI_GNN/raw_data/onset_df.csv')

  onset_df = pd.read_csv('/blue/yonghui.wu/lideyi/AKI_GNN/raw_data/onset_df.csv')


In [5]:
# type formatting
# patient / encounter identifiers are always handled as strings
onset_df[pat_id_cols] = onset_df[pat_id_cols].astype(str)
# parse every date column in one column-wise pass ('mixed' lets each value use its own format)
date_cols = ['ADMIT_DATE', 'DISCHARGE_DATE', 'AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET']
onset_df[date_cols] = onset_df[date_cols].apply(pd.to_datetime, format = 'mixed')

In [6]:
# record the starting number of encounters per center
encounter_num_df = fill_in_encounter_num_df(
    screen_item = 'Total number of encounters',
    ct_names = ct_names,
    onset_df = onset_df,
    encounter_num_df = encounter_num_df,
)

KUMC: Total number of encounters 266199
UPITT: Total number of encounters 587556


# Read Demographics

In [7]:
from read_dfs import read_and_format_DEMO

In [8]:
DEMO_df = read_and_format_DEMO(ct_names, raw_path, race_mapping)
# cast the id columns plus SEX and RACE to str; SEX and RACE are kept (not dropped)
# because the eGFR calculation later in the notebook reads them
demo_str_cols = pat_id_cols + ['SEX', 'RACE']
DEMO_df[demo_str_cols] = DEMO_df[demo_str_cols].astype(str)

In [9]:
# attach demographics to every encounter; the left join keeps all encounters
onset_df = onset_df.merge(DEMO_df, how = 'left', on = pat_id_cols)

# every encounter must have AGE, SEX and RACE after the join
for demo_col in ['AGE', 'SEX', 'RACE']:
    assert(onset_df[demo_col].isna().mean() == 0)

# keep encounters with 18 <= AGE < 90 (an age of exactly 90 is dropped),
# then renumber the rows from 0
onset_df = onset_df[onset_df['AGE'].between(18, 90, inclusive = 'left')].reset_index(drop = True)

In [10]:
# record the per-center counts that remain after the age filter
encounter_num_df = fill_in_encounter_num_df(
    screen_item = 'Age between 18 and 90',
    ct_names = ct_names,
    onset_df = onset_df,
    encounter_num_df = encounter_num_df,
)

KUMC: Age between 18 and 90 262637
UPITT: Age between 18 and 90 551072


# Read Diagnoses and Filter Encounters with ESRD, Dialysis and RRT

In [11]:
# in the original read_and_format_DX, we have DX df merged with onset_df (patients to use MDRD in that case),
#  however, it is too large here to be merged, thus we do it separately here
from read_dfs import read_DX, concat_dfs_to_one

In [12]:
# Codes used to flag ESRD, dialysis and renal transplantation (see the section heading above).
# Each key is a code-type label that is compared with DX_TYPE / PX_TYPE further down;
# each value is the list of codes accepted for that type.
# NOTE(review): '9' and '10' presumably denote ICD-9 and ICD-10 -- confirm against the data dictionary.
ESRD_dia_RRT_codes = {
    '9': ['585.6', '39.93','39.95','54.98', 'V45.11', '55.51','55.52','55.53','55.54','55.61','55.69', 'V42.0'],
    
    '10': ['N18.6','5A1D00Z','5A1D60Z','5A1D70Z','5A1D80Z','5A1D90Z', 'Z99.2', '0TY00Z0','0TY00Z1','0TY00Z2',
           '0TY10Z0','0TY10Z1','0TY10Z2','0TB00ZZ','0TB10ZZ','0TT00ZZ','0TT10ZZ','0TT20ZZ', 'Z94.0'],
    
    # every integer code from 90935 through 90999 (as strings), followed by an explicit list of 503xx codes
    'CH': [str(cpt) for cpt in range(90935, 91000)] + \
        ['50300','50320','50323','50325','50327','50328','50329','50340','50360','50365','50370','50380']
}

In [13]:
# load the diagnosis tables; the result is iterated below as {center name: diagnosis DataFrame}
DX_dict = read_DX(ct_names, raw_path)

  0%|          | 0/2 [00:00<?, ?it/s]

  DX_df = pd.read_csv(data_path + "AKI_DX.csv", delimiter = ',')
  DX_df = pd.read_csv(data_path + "AKI_DX.csv", delimiter = ',', usecols=use_cols)
100%|██████████| 2/2 [01:10<00:00, 35.10s/it]


In [14]:
# return encounters related to the input code dict
def get_enc_by_DX_code(DX_dict: dict, pat_df: pd.DataFrame, 
                       code_dict: dict, code_types: list, pat_id_cols: list) -> dict:
    """Find encounters whose patient has a listed diagnosis dated before admission.

    Args:
        DX_dict: mapping of center name -> diagnosis DataFrame. Each frame must
            have the columns PATID, DX_TYPE, DX, DX_DATE and DAYS_SINCE_ADMIT.
            The frames are read only; they are not modified.
        pat_df: encounter table with CENTER_NAME, PATID, ADMIT_DATE and the
            columns named in ``pat_id_cols``.
        code_dict: mapping of code type -> list of diagnosis codes.
        code_types: the keys of ``code_dict`` to match against DX_TYPE.
        pat_id_cols: columns that identify an encounter; the only columns returned.

    Returns:
        dict of center name -> DataFrame holding one row (``pat_id_cols`` only)
        per encounter that has at least one matching diagnosis with
        DX_DATE < ADMIT_DATE. A center without matches maps to an empty frame.
    """
    processed_DX_dict = dict()
    # centers whose DX_DATE is rebuilt from ADMIT_DATE + DAYS_SINCE_ADMIT instead of being parsed
    ct_missing_DX_DATE = ['UTHSCSA', 'UTSW', 'UofU']
    DX_cols = ['PATID', 'DX_DATE', 'DX', 'DX_TYPE', 'DAYS_SINCE_ADMIT']
    
    for ct_name, DX_df in tqdm(DX_dict.items()):
        # normalise the code type on a local Series so the caller's frame stays untouched
        DX_type = DX_df['DX_TYPE'].astype(str).replace({'09': '9', '9.0': '9', '10.0': '10'})
        DX_code = DX_df['DX'].astype(str)
        
        # positional mask of the rows whose (type, code) pair is listed in code_dict;
        # built with numpy so a non-unique index cannot cause misalignment
        in_codes = np.zeros(len(DX_df), dtype = bool)
        for code_type in code_types:
            in_codes |= ((DX_type == code_type) & DX_code.isin(code_dict[code_type])).to_numpy()
        
        # only the matching rows are copied and retyped
        DX_hits = DX_df.loc[in_codes, DX_cols].copy()
        DX_hits['PATID'] = DX_hits['PATID'].astype(str)
        
        # inner join: encounters of patients without a matching diagnosis are never materialised
        pat_ct_df = pat_df[pat_df.CENTER_NAME == ct_name].merge(DX_hits, on = 'PATID', how = 'inner')
        
        # format time cols so that we can filter "future" dx later
        if ct_name not in ct_missing_DX_DATE:
            DX_dates = pd.to_datetime(pat_ct_df['DX_DATE'], format = 'mixed')
            # round-trip through a date-only string to discard any time-of-day part
            pat_ct_df['DX_DATE'] = pd.to_datetime(DX_dates.dt.strftime('%Y-%m-%d'), format = 'mixed')
        else:
            # NOTE(review): the join is on PATID only, so DAYS_SINCE_ADMIT is added to the
            # ADMIT_DATE of every encounter of that patient -- confirm this is intended
            pat_ct_df['DX_DATE'] = pat_ct_df['ADMIT_DATE'] + \
                pd.to_timedelta(pat_ct_df['DAYS_SINCE_ADMIT'], unit = 'D')

        # require that it is "history", filter "future" dx
        history_df = pat_ct_df[pat_ct_df.DX_DATE < pat_ct_df.ADMIT_DATE]
        
        # keep only the encounter ids, one row per encounter
        processed_DX_dict[ct_name] = history_df[pat_id_cols].drop_duplicates().reset_index(drop = True)
        
    return processed_DX_dict

In [15]:
# encounters whose patient has a listed diagnosis code dated before admission, per center
enc_to_remove_DX_dict = get_enc_by_DX_code(
    DX_dict = DX_dict,
    pat_df = onset_df,
    code_dict = ESRD_dia_RRT_codes,
    code_types = ['9', '10'],
    pat_id_cols = pat_id_cols,
)

100%|██████████| 2/2 [00:32<00:00, 16.15s/it]


In [16]:
# pool the flagged encounters of all centers, one row per encounter
enc_to_remove_DX_all = concat_dfs_to_one(enc_to_remove_DX_dict).drop_duplicates()
# anti-join: tag each encounter with where its key was found, keep those found only in onset_df
merged_df = onset_df.merge(enc_to_remove_DX_all, how = 'left', on = pat_id_cols, indicator = True)
onset_df = merged_df.loc[merged_df['_merge'].eq('left_only')].drop(columns = '_merge')

# Read Procedures

In [17]:
from read_dfs import read_procedures

In [18]:
# load the procedure tables; the result is iterated below as {center name: procedure DataFrame}
PX_dict = read_procedures(ct_names, raw_path)

  PX_df = pd.read_csv(data_path + "AKI_PX.csv", delimiter = ',', usecols = ['PATID', 'PX_DATE"+PD.DATE_SHIFT"', 'PX','PX_TYPE'])
100%|██████████| 2/2 [00:45<00:00, 22.61s/it]


In [19]:
# return encounters related to the input code dict
def get_enc_by_PX_code(PX_dict, pat_df, code_dict, code_types, pat_id_cols):
    """Find encounters whose patient has a listed procedure dated before admission.

    Args:
        PX_dict: mapping of center name -> procedure DataFrame. Each frame must
            have the columns PATID, PX_TYPE, PX and PX_DATE. The frames are
            read only; they are not modified.
        pat_df: encounter table with CENTER_NAME, PATID, ADMIT_DATE and the
            columns named in ``pat_id_cols``.
        code_dict: mapping of code type -> list of procedure codes.
        code_types: the keys of ``code_dict`` to match against PX_TYPE.
        pat_id_cols: columns that identify an encounter; the only columns returned.

    Returns:
        dict of center name -> DataFrame holding one row (``pat_id_cols`` only)
        per encounter that has at least one matching procedure with
        PX_DATE < ADMIT_DATE. A center without matches maps to an empty frame.
    """
    processed_PX_dict = dict()
    
    for ct_name, PX_df in tqdm(PX_dict.items()):
        # normalise the code type on a local Series so the caller's frame stays untouched
        PX_type = PX_df['PX_TYPE'].astype(str).replace({'09': '9', '9.0': '9', '10.0': '10'})
        PX_code = PX_df['PX'].astype(str)
        
        # positional mask of the rows whose (type, code) pair is listed in code_dict;
        # built with numpy so a non-unique index cannot cause misalignment
        in_codes = np.zeros(len(PX_df), dtype = bool)
        for code_type in code_types:
            in_codes |= ((PX_type == code_type) & PX_code.isin(code_dict[code_type])).to_numpy()
        
        # only the matching rows are copied and retyped
        PX_hits = PX_df.loc[in_codes].copy()
        PX_hits['PATID'] = PX_hits['PATID'].astype(str)
        
        # inner join: encounters of patients without a matching procedure are never materialised
        pat_ct_df = pat_df[pat_df.CENTER_NAME == ct_name].merge(PX_hits, on = 'PATID', how = 'inner')
        
        # format time cols so that we can filter "future" px later
        PX_dates = pd.to_datetime(pat_ct_df['PX_DATE'], format = 'mixed')
        # round-trip through a date-only string to discard any time-of-day part
        pat_ct_df['PX_DATE'] = pd.to_datetime(PX_dates.dt.strftime('%Y-%m-%d'), format = 'mixed')

        # require that it is "history", filter "future" px
        history_df = pat_ct_df[pat_ct_df.PX_DATE < pat_ct_df.ADMIT_DATE]
        
        # keep only the encounter ids, one row per encounter
        processed_PX_dict[ct_name] = history_df[pat_id_cols].drop_duplicates().reset_index(drop = True)
        
    return processed_PX_dict

In [20]:
# encounters whose patient has a listed procedure code dated before admission, per center
enc_to_remove_PX_dict = get_enc_by_PX_code(
    PX_dict = PX_dict,
    pat_df = onset_df,
    code_dict = ESRD_dia_RRT_codes,
    code_types = ['9', '10', 'CH'],
    pat_id_cols = pat_id_cols,
)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:28<00:00, 14.47s/it]


In [21]:
# pool the flagged encounters of all centers, one row per encounter
enc_to_remove_PX_all = concat_dfs_to_one(enc_to_remove_PX_dict).drop_duplicates()
# anti-join: tag each encounter with where its key was found, keep those found only in onset_df
merged_df = onset_df.merge(enc_to_remove_PX_all, how = 'left', on = pat_id_cols, indicator = True)
onset_df = merged_df.loc[merged_df['_merge'].eq('left_only')].drop(columns = '_merge')

In [22]:
# record the per-center counts that remain after the diagnosis- and procedure-based exclusions
encounter_num_df = fill_in_encounter_num_df(
    screen_item = 'Patients with ESRD, dialysis and RRT excluded',
    ct_names = ct_names,
    onset_df = onset_df,
    encounter_num_df = encounter_num_df,
)

KUMC: Patients with ESRD, dialysis and RRT excluded 262637
UPITT: Patients with ESRD, dialysis and RRT excluded 551072


# Compute eGFR and Filtered by SCr Baseline and eGFR

In [23]:
#calculate eGFR, based on SCr baseline
def calculate_ckd_epi(row):
    """Compute eGFR for one encounter from its baseline SCr.

    Args:
        row: mapping-like record with the keys 'AGE', 'SEX', 'RACE' and
            'BASELINE_SCR'. Any SEX other than 'F' uses the non-female
            constants; any RACE other than "Black" uses a race factor of 1.

    Returns:
        141 * min(SCr/k, 1)**alpha * max(SCr/k, 1)**-1.209 * 0.993**AGE
        * sex factor * race factor.
    """
    # sex-specific constants: (k, alpha, sex multiplier)
    if row['SEX'] == 'F':
        kappa, exponent, sex_factor = 0.7, -0.329, 1.018
    else:
        kappa, exponent, sex_factor = 0.9, -0.411, 1
    race_factor = 1.159 if row['RACE'] == "Black" else 1

    scr_ratio = row['BASELINE_SCR'] / kappa
    # only one of the two terms differs from 1, depending on whether SCr/k is below or above 1
    low_term = min(scr_ratio, 1) ** exponent
    high_term = max(scr_ratio, 1) ** -1.209
    age_term = 0.993 ** row['AGE']

    return 141 * low_term * high_term * age_term * sex_factor * race_factor

In [24]:
# eGFR for every encounter, computed row by row from AGE, SEX, RACE and BASELINE_SCR
onset_df['EGFR'] = onset_df.apply(calculate_ckd_epi, axis = 1)
# Exclusion rule: drop encounters with eGFR < 15 mL/min/1.73 m^2 or baseline SCr > 3.5 mg/dL.
# The kept set is the inclusive complement, so an encounter sitting exactly on a threshold
# (eGFR == 15 or SCr == 3.5) is retained. Rows with a missing value fail the comparison
# and are dropped.
onset_df = onset_df[(onset_df.EGFR >= 15) & (onset_df.BASELINE_SCR <= 3.5)]

In [25]:
# record the per-center counts that remain after the baseline SCr / eGFR filter
encounter_num_df = fill_in_encounter_num_df(
    screen_item = 'Patients with SCr baseline or eGFR out of range excluded',
    ct_names = ct_names,
    onset_df = onset_df,
    encounter_num_df = encounter_num_df,
)

KUMC: Patients with SCr baseline or eGFR out of range excluded 262637
UPITT: Patients with SCr baseline or eGFR out of range excluded 551072


# Preprocessing and Sanity Check Before Merging with SCr DataFrame

1. Now we need to establish the prediction point for both AKI and non-AKI encounters. For AKI encounters, we find the onset date of the most severe AKI stage; the prediction point was originally set 24h before that onset date (this offset is deprecated: the code below now uses 0h, i.e. the onset date itself).  
2. For non-AKI encounters, we find the middle SCr measurement date of the encounter, and the prediction point is derived from that middle SCr measurement date with the same offset.

In [26]:
# set up the prediction target column, non-AKI = 0, AKI-1 = 1, AKI-2 = 2, AKI-3 = 3
def set_AKI_target(row):
    """Return the highest AKI stage (3, 2 or 1) that has an onset date, else 0.

    Args:
        row: mapping-like record with 'AKI1_ONSET', 'AKI2_ONSET' and 'AKI3_ONSET'.
    """
    # check the most severe stage first so that it wins when several stages are present
    for stage in (3, 2, 1):
        if pd.notna(row[f'AKI{stage}_ONSET']):
            return stage
    return 0

# row-wise: label each encounter with its most severe AKI stage (0 when no stage has an onset)
onset_df['AKI_TARGET'] = onset_df.apply(func = set_AKI_target, axis = 'columns')

In [27]:
# share of encounters in each AKI stage (0 = non-AKI), ordered by stage
AKI_stage_percentage = (
    onset_df['AKI_TARGET']
    .value_counts(normalize = True)
    .sort_index()
)
AKI_stage_percentage

AKI_TARGET
0    0.768563
1    0.160421
2    0.045809
3    0.025207
Name: proportion, dtype: float64

In [28]:
# get the most severe AKI stage onset date for each encounter
def set_severe_AKI_onset_date(row):
    """Return the onset date of the most severe AKI stage present, else ``pd.NaT``.

    Args:
        row: mapping-like record with 'AKI1_ONSET', 'AKI2_ONSET' and 'AKI3_ONSET'.
    """
    # stage 3 is checked first, then stage 2, then stage 1
    for onset_col in ('AKI3_ONSET', 'AKI2_ONSET', 'AKI1_ONSET'):
        if pd.notna(row[onset_col]):
            return row[onset_col]
    return pd.NaT

# row-wise: onset date of the most severe AKI stage, NaT when no stage has an onset
onset_df['AKI_ONSET_DATE'] = onset_df.apply(func = set_severe_AKI_onset_date, axis = 'columns')

In [29]:
from read_dfs import dup_check

In [30]:
# dups check
dup_check(onset_df, pat_id_cols)
# every AKI onset must fall inside its own encounter: ADMIT_DATE <= AKI_ONSET_DATE <= DISCHARGE_DATE
AKI_onset_encounters = onset_df.loc[onset_df.AKI_ONSET_DATE.notna()]
assert AKI_onset_encounters['AKI_ONSET_DATE'].ge(AKI_onset_encounters['ADMIT_DATE']).all()
assert AKI_onset_encounters['AKI_ONSET_DATE'].le(AKI_onset_encounters['DISCHARGE_DATE']).all()
# every center must still have at least one encounter
for ct_name in ct_names:
    assert (onset_df.CENTER_NAME == ct_name).any()
# AKI_ONSET_LABEL must flag exactly as many encounters as have AKI_TARGET > 0
assert onset_df.AKI_ONSET_LABEL.sum() == (onset_df.AKI_TARGET > 0).sum()

# of rows before dropping dups:  813709
# of rows after dropping dups:  813709


# Read SCr DataFrame

In [31]:
from read_dfs import read_and_format_SCR

In [32]:
# load the SCr measurements; the columns CENTER_NAME, PATID, SPECIMEN_DATE and RESULT_NUM are used below
SCR_df = read_and_format_SCR(ct_names, raw_path)

100%|██████████| 2/2 [00:40<00:00, 20.43s/it]
100%|██████████| 2/2 [05:12<00:00, 156.43s/it]


In [33]:
# only non-AKI encounters are handled here: their prediction point is derived from the
# middle SCr measurement (the former 24h offset is deprecated; the offset is now 0h)
non_AKI_df = onset_df.loc[onset_df.AKI_ONSET_LABEL.eq(False)].copy(deep = True)
# attach every SCr measurement of the same patient at the same center ...
non_AKI_SCR_df = non_AKI_df.merge(
    SCR_df[['CENTER_NAME', 'PATID', 'SPECIMEN_DATE', 'RESULT_NUM']],
    how = 'left',
    on = ['CENTER_NAME', 'PATID'],
)
# ... then keep only the measurements taken between ADMIT_DATE and DISCHARGE_DATE (both inclusive)
non_AKI_SCR_df = non_AKI_SCR_df[
    non_AKI_SCR_df.SPECIMEN_DATE.between(non_AKI_SCR_df.ADMIT_DATE, non_AKI_SCR_df.DISCHARGE_DATE)
]

In [34]:
# order the measurements chronologically within each encounter
non_AKI_SCR_df = non_AKI_SCR_df.sort_values(by = pat_id_cols + ['SPECIMEN_DATE'])

# pick the middle row of a group
def get_mid_percentile_row(group):
    """Return the row at position ``len(group) // 2`` of ``group``.

    For an even number of rows this is the later of the two central rows.
    """
    return group.iloc[len(group) // 2]

# one row per encounter: the middle SCr measurement of its sorted group
non_AKI_mid_SCR_df = (
    non_AKI_SCR_df
    .groupby(pat_id_cols)
    .progress_apply(get_mid_percentile_row)
    .reset_index(drop = True)
)

100%|██████████| 625255/625255 [01:23<00:00, 7516.09it/s] 


In [35]:
# sanity check
# len(non_AKI_mid_SCR_df) == len(non_AKI_df) cannot be required: encounters without any
# SCr measurement have no middle row; they are ignored here and dropped later
assert (len(non_AKI_df) >= len(non_AKI_mid_SCR_df))
# the middle SCr rows must not contain duplicated encounters
dup_check(non_AKI_mid_SCR_df, pat_id_cols)
# fraction of encounters whose middle SCr measurement falls on the admission date
print("Middle SCr measuresment overlap with admission date rate: ",
int((non_AKI_mid_SCR_df.SPECIMEN_DATE == non_AKI_mid_SCR_df.ADMIT_DATE).sum()) / len(non_AKI_mid_SCR_df))
# fraction of encounters whose middle SCr measurement falls on the discharge date
print("Middle SCr measuresment overlap with discharge date rate: ", 
int((non_AKI_mid_SCR_df.SPECIMEN_DATE == non_AKI_mid_SCR_df.DISCHARGE_DATE).sum()) / len(non_AKI_mid_SCR_df))

# of rows before dropping dups:  625255
# of rows after dropping dups:  625255
Middle SCr measuresment overlap with admission date rate:  0.015121830293240358
Middle SCr measuresment overlap with discharge date rate:  0.033018528440396315


In [36]:
# bring the middle SCr date back onto onset_df under the name MID_SCR_DATE;
# encounters without a middle SCr row get a missing MID_SCR_DATE from the left join
non_AKI_mid_SCR_df = non_AKI_mid_SCR_df.rename(columns = {'SPECIMEN_DATE': 'MID_SCR_DATE'})
onset_df = onset_df.merge(non_AKI_mid_SCR_df[pat_id_cols + ['MID_SCR_DATE']], how = 'left', on = pat_id_cols)

In [37]:
# keep an encounter only if at least one of AKI_ONSET_DATE / MID_SCR_DATE is present
onset_df = onset_df[onset_df[['AKI_ONSET_DATE', 'MID_SCR_DATE']].notna().any(axis = 1)]

In [38]:
# Consistency checks: split onset_df by whether an AKI onset date exists
has_AKI_onset = onset_df['AKI_ONSET_DATE'].notna()
with_onset_df = onset_df[has_AKI_onset]
without_onset_df = onset_df[~has_AKI_onset]

# encounters with an onset date: no middle SCr date, positive target, label True
assert with_onset_df['MID_SCR_DATE'].isna().all(), "If AKI_ONSET_DATE is not NaT, MID_SCR_DATE should be NaT"
assert (with_onset_df['AKI_TARGET'] > 0).all(), "If AKI_ONSET_DATE is not NaT, AKI_TARGET should be > 0"
assert (with_onset_df['AKI_ONSET_LABEL'] == True).all(), "If AKI_ONSET_DATE is not NaT, AKI_ONSET_LABEL should be True"

# encounters without an onset date: middle SCr date present, target 0, label False
assert without_onset_df['MID_SCR_DATE'].notna().all(), "If AKI_ONSET_DATE is NaT, MID_SCR_DATE should not be NaT"
assert (without_onset_df['AKI_TARGET'] == 0).all(), "If AKI_ONSET_DATE is NaT, AKI_TARGET should be 0"
assert (without_onset_df['AKI_ONSET_LABEL'] == False).all(), "If AKI_ONSET_DATE is NaT, AKI_ONSET_LABEL should be False"

In [39]:
# PREDICTION_POINT: the row-wise minimum of AKI_ONSET_DATE and MID_SCR_DATE (missing values
# are skipped, so it is whichever date is present), shifted back by the offset below.
# The offset used to be 24h and is now 0h.
onset_df['PREDICTION_POINT'] = onset_df[['AKI_ONSET_DATE', 'MID_SCR_DATE']].min(axis = 1) - pd.Timedelta(days = 0)

In [40]:
# observation window length in days; the window ends on PREDICTION_POINT,
# so it starts ob_window_len - 1 days earlier
ob_window_len = 2
ob_window_offset = pd.Timedelta(days = ob_window_len - 1)
onset_df['OBSERVATION_WINDOW_START'] = onset_df['PREDICTION_POINT'] - ob_window_offset
# whole days from admission to the window start; negative means the window begins before admission
onset_df['OBSERVATION_WINDOW_TO_ADMIT'] = (onset_df['OBSERVATION_WINDOW_START'] - onset_df['ADMIT_DATE']).dt.days

In [41]:
# split the encounters by AKI stage (0 = non-AKI ... 3 = AKI-3)
AKI_0, AKI_1, AKI_2, AKI_3 = (onset_df[onset_df['AKI_TARGET'] == stage] for stage in range(4))
# per stage: percentage of encounters whose observation window starts before admission
window_pct_0, window_pct_1, window_pct_2, window_pct_3 = (
    (stage_df['OBSERVATION_WINDOW_TO_ADMIT'] < 0).mean() * 100
    for stage_df in (AKI_0, AKI_1, AKI_2, AKI_3)
)

print(f"Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-0: {window_pct_0:.2f}%")
print(f"Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-1: {window_pct_1:.2f}%")
print(f"Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-2: {window_pct_2:.2f}%")
print(f"Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-3: {window_pct_3:.2f}%")

Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-0: 1.51%
Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-1: 29.41%
Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-2: 47.04%
Percentage of OBSERVATION_WINDOW_TO_ADMIT < 0 when AKI-3: 34.16%


In [42]:
# keep only encounters whose observation window (48h) starts on or after the admission date
onset_df = onset_df.loc[onset_df['OBSERVATION_WINDOW_TO_ADMIT'].ge(0)]

In [43]:
# share of encounters in each AKI stage after the observation-window filter, ordered by stage
AKI_stage_percentage = (
    onset_df['AKI_TARGET']
    .value_counts(normalize = True)
    .sort_index()
)
AKI_stage_percentage

AKI_TARGET
0    0.830832
1    0.124317
2    0.026632
3    0.018219
Name: proportion, dtype: float64

In [44]:
# record the per-center counts that remain after requiring a complete observation window
encounter_num_df = fill_in_encounter_num_df(
    screen_item = 'Observation window should be complete',
    ct_names = ct_names,
    onset_df = onset_df,
    encounter_num_df = encounter_num_df,
)

KUMC: Observation window should be complete 244259
UPITT: Observation window should be complete 496926


In [45]:
# display the per-center encounter counts recorded after each screening step
encounter_num_df

Unnamed: 0,KUMC,UPITT
Total number of encounters,266199,587556
Age between 18 and 90,262637,551072
"Patients with ESRD, dialysis and RRT excluded",262637,551072
Patients with SCr baseline or eGFR out of range excluded,262637,551072
Observation window should be complete,244259,496926


In [46]:
# display the filtered encounter table with all intermediate columns still attached
onset_df

Unnamed: 0,PATID,ONSETS_ENCOUNTERID,ADMIT_DATE,DISCHARGE_DATE,CENTER_NAME,BASELINE_SCR,AKI1_ONSET,AKI2_ONSET,AKI3_ONSET,AKI_ONSET_LABEL,AGE,SEX,RACE,EGFR,AKI_TARGET,AKI_ONSET_DATE,MID_SCR_DATE,PREDICTION_POINT,OBSERVATION_WINDOW_START,OBSERVATION_WINDOW_TO_ADMIT
0,611629,980022,2012-02-05,2012-02-09,KUMC,0.950000,NaT,NaT,NaT,False,68,M,White,81.918005,0,NaT,2012-02-07,2012-02-07,2012-02-06,1
1,611629,810534,2012-01-31,2012-02-22,KUMC,1.000000,2012-02-05,NaT,NaT,True,68,M,White,76.992284,1,2012-02-05,NaT,2012-02-05,2012-02-04,4
2,611651,8458359,2019-06-28,2019-07-01,KUMC,0.550000,NaT,NaT,NaT,False,39,F,White,118.153207,0,NaT,2019-06-30,2019-06-30,2019-06-29,1
3,611651,6840293,2019-07-03,2019-07-31,KUMC,0.590000,NaT,NaT,NaT,False,39,F,White,115.455473,0,NaT,2019-07-15,2019-07-15,2019-07-14,11
4,611651,6833322,2019-06-15,2019-06-19,KUMC,0.492826,NaT,NaT,NaT,False,39,F,White,122.497892,0,NaT,2019-06-16,2019-06-16,2019-06-15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813704,PIT991837,5076016390310_0922,2014-09-30,2014-10-02,UPITT,0.815000,NaT,NaT,NaT,False,29,F,White,97.415728,0,NaT,2014-10-01,2014-10-01,2014-09-30,0
813705,PIT995907,5088062911236_0112,2014-03-01,2014-03-04,UPITT,0.716667,NaT,NaT,NaT,False,54,F,White,95.470971,0,NaT,2014-03-02,2014-03-02,2014-03-01,0
813706,PIT995946,5080489850558_0202,2014-03-05,2014-03-07,UPITT,0.600000,NaT,NaT,NaT,False,32,F,White,120.605840,0,NaT,2014-03-07,2014-03-07,2014-03-06,1
813707,PIT996157,5070474240494_0818,2015-03-29,2015-04-01,UPITT,0.600000,NaT,NaT,NaT,False,50,F,White,106.280799,0,NaT,2015-03-31,2015-03-31,2015-03-30,1


In [50]:
# intermediate columns that are not part of the saved table
helper_cols = ['EGFR', 'AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET', 'AKI_ONSET_LABEL', 
               'AKI_ONSET_DATE', 'MID_SCR_DATE',  'OBSERVATION_WINDOW_TO_ADMIT']
# columns of the saved table, in output order
output_cols = ['CENTER_NAME', 'PATID', 'ONSETS_ENCOUNTERID', 'ADMIT_DATE', 'DISCHARGE_DATE', 
               'AGE', 'SEX', 'RACE', 'BASELINE_SCR', 'OBSERVATION_WINDOW_START','PREDICTION_POINT', 'AKI_TARGET']
# drop the helpers, renumber the rows from 0, then put the columns in output order
onset_df_cleaned = onset_df.drop(columns = helper_cols).reset_index(drop = True)[output_cols]
# save the cleaned onset_df
onset_df_cleaned.to_csv('/blue/yonghui.wu/lideyi/AKI_GNN/raw_data/onset_df_cleaned.csv', index = False)